Background

This module builds on code contained in Coronavirus_Statistics_USAF_v004.Rmd. This file includes the latest code for analyzing data from USA Facts. USA Facts maintains data on cases and deaths by county for coronavirus in the US. Downloaded data are unique by county with date as a column and a separate file for each of cases, deaths, and population.

The intent of this code is to source updated functions that allow for readRunUSAFacts() to be run to obtain, read, process, and analyze data from USA Facts.

Sourcing Functions

The tidyverse library is loaded, and the functions used for CDC daily processing are sourced. Additionally, specific functions for USA Facts are also sourced:

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.1     v dplyr   1.0.6
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Functions are available in source file
source("./Generic_Added_Utility_Functions_202105_v001.R")
source("./Coronavirus_CDC_Daily_Functions_v001.R")
source("./Coronavirus_USAF_Functions_v001.R")

Further, the mapping file specific to USA Facts is sourced:

source("./Coronavirus_USAF_Default_Mappings_v001.R")

A few functions should be added back to Generic_Added_…v001.R and Coronavirus_CDC_Daily…_v001.R after they have been more thoroughly checked for compatibility with the state-based clustering. For now, they are included below so as to overwrite the functions obtained from source(…):

# Updated function for handling county-level clusters
# Updated function for handling county-level clusters
createSummary <- function(df, 
                          stateClusterDF=NULL,
                          brewPalette=NA, 
                          dataType="state"
                          ) {
    
    # FUNCTION ARGUMENTS:
    # df: an integrated data frame by cluster-date
    # stateClusterDF: a data frame containing state-cluster (NULL means it can be found in df)
    # brewPalette: character string for a palette from RColorBrewer to be used (NA means default colors)
    # dataType: the type of maps being produced ("state" or "county")
    
    # Produces four dashboard-ready plots:
    # p1: map of segments
    # p2: bar plot of counts by segment
    # p3: facetted bar plot of segment descriptors (e.g., population, burden per million)
    # p4: facetted trend-line plot of burden by segments
    
    # Hoist the values shared by the two helperSummaryMap() calls
    isState <- dataType == "state"
    clusterDF <- if (is.null(stateClusterDF)) df else stateClusterDF
    mapRegions <- if (isState) "states" else "counties"
    mapKey <- if (isState) "state" else "countyFIPS"
    fillArgs <- if (is.na(brewPalette)) list() else 
        list("arg1"=scale_fill_brewer("Cluster", palette=brewPalette))
    
    # p1: map of the clusters (small east-coast states get text labels)
    p1 <- helperSummaryMap(clusterDF, 
                           mapLevel=mapRegions,
                           keyCol=mapKey,
                           discreteValues=TRUE, 
                           labelScale=is.na(brewPalette), 
                           textLabel=if (isState) c("RI", "CT", "DE", "MD", "DC") else c(),
                           extraArgs=fillArgs
                           )
    
    # p2: bar plot of member counts by segment
    p2 <- helperSummaryMap(clusterDF, 
                           mapLevel=mapRegions,
                           keyCol=mapKey,
                           discreteValues=TRUE, 
                           labelScale=is.na(brewPalette), 
                           countOnly=TRUE,
                           extraArgs=fillArgs
                           )
    
    # p3: population and cumulative burden by cluster
    p3 <- helperAggTotal(df, 
                         aggVars=c("pop", "wm_tcpm7", "wm_tdpm7"), 
                         mapper=c("pop"="Population (millions)", 
                                  "wm_tcpm7"="Cases per thousand", 
                                  "wm_tdpm7"="Deaths per million"
                                  ), 
                         xLab=NULL, 
                         yLab=NULL, 
                         title=NULL,
                         divideBy=c("pop"=1000000, "wm_tcpm7"=1000), 
                         extraArgs=fillArgs
                         )
    
    # p4: cumulative burden per million over time
    # (hospitalization is only available at the state level)
    p4xtra <- list(arg1=scale_x_date(date_breaks="2 months", date_labels="%b-%y"), 
                   arg2=theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
                   )
    if (!is.na(brewPalette)) p4xtra$arg3 <- scale_color_brewer("Cluster", palette=brewPalette)
    p4 <- helperAggTrend(df, 
                         aggVars=c("wm_tcpm7", "wm_tdpm7", if (isState) "wm_hpm7"), 
                         mapper=c("wm_tcpm7"="Cases per thousand\n(cumulative)", 
                                  "wm_tdpm7"="Deaths per million\n(cumulative)", 
                                  "wm_hpm7"="Hospitalized per million\n(current)"
                                  ),
                         yLab=NULL,
                         title=NULL, 
                         divideBy=c("wm_tcpm7"=1000), 
                         linesize=0.75,
                         extraArgs=p4xtra
                         )
    
    list(p1=p1, p2=p2, p3=p3, p4=p4)
    
}



# Helper function to make a summary map
# Helper function to make a summary map
helperSummaryMap <- function(df, 
                             mapLevel="states", 
                             keyCol="state",
                             values="cluster",
                             discreteValues=NULL,
                             legend.position="right",
                             labelScale=TRUE,
                             extraArgs=list(),
                             countOnly=FALSE,
                             textLabel=c(),
                             ...
                             ) {
    
    # FUNCTION ARGUMENTS:
    # df: a data frame containing a level of geography and an associated cluster
    # mapLevel: a parameter for whether the map is "states" or "counties"
    # keyCol: the key column for plotting (usmap::plot_usmap is particular, and this must be 'state' or 'fips')
    # values: the character name of the field containing the data to be plotted
    # discreteValues: boolean for whether the values are discrete (if not, use continuous)
    #                 NULL means infer from data
    # legend.position: character for the location of the legend in the plot
    # labelScale: boolean, should a scale_fill_ be created?  Use FALSE if contained in extraArgs
    # extraArgs: list of other arguments that will be appended as '+' to the end of the usmap::plot_usmap call
    # countOnly: should a bar plot of counts only be produced?
    # textLabel: a list of elements that should be labelled as text on the plot (too small to see)
    # ...: other parameters to be passed to usmap::plot_usmap (e.g., labels, include, exclude, etc.)
    
    # Keep only the key and value columns, one row per geography
    df <- df %>%
        select(all_of(c(keyCol, values))) %>%
        distinct()
    
    # Infer discreteness from the data when not supplied explicitly
    if (is.null(discreteValues)) discreteValues <- !is.numeric(df[[values]])
    
    # Convert numeric data to a factor when a discrete plot is requested
    # (&& rather than & since this is a scalar condition)
    if (isTRUE(discreteValues) && is.numeric(df[[values]])) 
        df[[values]] <- factor(df[[values]])
    
    # If count only is needed, create a count bar plot; otherwise create a map
    if (isTRUE(countOnly)) { 
        # Horizontal bar plot of member counts with count labels at bar midpoints
        # (.data pronoun and after_stat() replace get()/aes_string()/..count..)
        gg <- df %>%
            ggplot(aes(x=fct_rev(.data[[values]]))) + 
            geom_bar(aes(fill=.data[[values]])) + 
            stat_count(aes(label=after_stat(count), y=after_stat(count)/2), 
                       geom="text", 
                       position="identity", 
                       fontface="bold"
                       ) +
            coord_flip() + 
            labs(y="Number of members", x="")
    } else {
        # usmap::plot_usmap requires the county key column to be named 'fips'
        if (keyCol=="countyFIPS") {
            df <- df %>% colRenamer(vecRename=c("countyFIPS"="fips"))
            keyCol <- "fips"
        }
        gg <- usmap::plot_usmap(regions=mapLevel, data=df, values=values, ...)
        # Stack text labels off the Atlantic coast for geographies too small to read
        if (length(textLabel) > 0) {
            labDF <- df %>% 
                filter(.data[[keyCol]] %in% textLabel) %>%
                mutate(rk=match(.data[[keyCol]], textLabel)) %>%
                arrange(rk) %>%
                mutate(lon=-70.1-seq(0, 0.8*length(textLabel)-0.8, by=0.8), 
                       lat=40.1-seq(0, 1.5*length(textLabel)-1.5, by=1.5)
                       ) %>%
                select(lon, lat, everything()) %>%
                usmap::usmap_transform()
            # NOTE(review): lon.1/lat.1 are the transformed-coordinate names produced
            # by older usmap::usmap_transform(); newer versions use x/y -- confirm version
            gg <- gg + geom_text(data=labDF, 
                                 aes(x=lon.1, y=lat.1, 
                                     label=paste(.data[[keyCol]], .data[[values]])), 
                                 size=3.25
                                 )
        }
    }
    
    # Position the legend as requested
    gg <- gg + theme(legend.position=legend.position)
    
    # Add a default fill scale unless the caller supplies one via extraArgs
    if (isTRUE(labelScale)) {
        gg <- gg + 
            if (isTRUE(discreteValues)) scale_fill_discrete(values) else scale_fill_continuous(values)
    }
    
    # Append any extra ggplot components supplied by the caller
    for (extraArg in extraArgs) gg <- gg + extraArg
    
    # Return the map object
    gg
    
}



# Function to pivot the data file longer
# Function to pivot the data file longer (or wider)
pivotData <- function(df, 
                      pivotKeys, 
                      nameVar="name", 
                      valVar="value",
                      toLonger=TRUE, 
                      ...
                      ) {
    
    # FUNCTION ARGUMENTS:
    # df: the data frame
    # pivotKeys: the keys (everything but cols for pivot_longer, id_cols for pivot_wider)
    # nameVar: variable name for names_to or names_from
    # valVar: variable name for values_to or values_from
    # toLonger: boolean, should pivot_longer() be used rather than pivot_wider()?
    # ...: other arguments to be passed to pivot_*()
    
    # Widen and return early when the longer format is not requested
    if (!isTRUE(toLonger)) {
        return(pivot_wider(df, 
                           all_of(pivotKeys), 
                           names_from=all_of(nameVar), 
                           values_from=all_of(valVar), 
                           ...
                           ))
    }
    
    # Otherwise, lengthen every column that is not one of the keys
    pivot_longer(df, -all_of(pivotKeys), names_to=nameVar, values_to=valVar, ...)
    
}

An existing processed USA Facts list is loaded for use as the comparison set:

cty_newformat_20201026 <- readFromRDS("cty_newformat_20201026")

A function is added to create a county-level cluster map with state borders:

# Function to create a county-level cluster map with state borders overlaid
sparseCountyClusterMap <- function(vec, 
                                   clustRemap=c("999"=NA), 
                                   caption=NULL, 
                                   brewPalette=NULL, 
                                   naFill="white"
                                   ) {
    
    # FUNCTION ARGUMENTS:
    # vec: a named vector where the names are countyFIPS and the values are the clusters
    #      can also be a data.frame, in which case clustersToFrame() and clustRemap are not used
    # clustRemap: remapping vector for clusters (default maps the "999" segment to NA)
    # caption: caption to be included (NULL means no caption)
    # brewPalette: name of a palette for scale_fill_brewer()
    #              NULL means use scale_fill_discrete() instead; "viridis" uses scale_fill_viridis_d()
    # naFill: fill to use for NA counties
    
    # Convert to a tibble for use with usmap if not already a data.frame
    # (inherits() is the robust test for data.frame-ness)
    if (inherits(vec, "data.frame")) {
        df <- vec
    } else {
        df <- clustersToFrame(vec, colNameName="fips", convFactor=FALSE) %>%
            mutate(cluster=as.character(cluster), 
                   cluster=ifelse(cluster %in% names(clustRemap), clustRemap[cluster], cluster)
                   )
    }
    
    # Create a blank state map supplying the black state border lines
    blankStates <- usmap::plot_usmap("states")
    
    # Create a county cluster map with NA (remapped/excluded) counties removed
    dataCounties <- df %>% 
        filter(!is.na(cluster)) %>% 
        usmap::plot_usmap(regions="counties", data=., values="cluster")
    
    # Rebuild as a single ggplot: county polygons filled by cluster, then state
    # borders drawn on top.  [[1]] extracts the transformed polygon data frame
    # from each usmap object; fill is mapped to the data's own 'cluster' column
    # (mapping an external vector via $ into aes() is fragile and non-idiomatic)
    p1 <- ggplot() + 
        geom_polygon(data=dataCounties[[1]], 
                     aes(x=x, y=y, group=group, fill=cluster), 
                     color = NA,
                     size = 0.1
                     ) +  
        geom_polygon(data=blankStates[[1]], 
                     aes(x=x, y=y, group=group), 
                     color = "black", 
                     lwd=1.25,
                     # NOTE(review): alpha(0.001) passes 0.001 as the *colour*
                     # argument of scales::alpha(); a transparent fill such as NA
                     # or alpha("white", 0.001) may be what was intended -- confirm
                     fill = alpha(0.001)
                     ) + 
        coord_equal() + 
        ggthemes::theme_map()
    
    # Add the appropriate fill (can use viridis_d also)
    if (is.null(brewPalette)) p1 <- p1 + scale_fill_discrete("Cluster", na.value=naFill)
    else if (brewPalette=="viridis") p1 <- p1 + scale_fill_viridis_d("Cluster", na.value=naFill)
    else p1 <- p1 + scale_fill_brewer("Cluster", palette=brewPalette, na.value=naFill)
    
    # Add caption if requested
    if (!is.null(caption)) p1 <- p1 + labs(caption=caption)
    
    # Return the plotting object
    p1
    
}

The function is then run using previously created segments:

sparseCountyClusterMap(cty_newformat_20201026$useClusters, 
                       brewPalette="Paired", 
                       caption="Counties with population under 25k are blank"
                       )

Next steps are to load new data and compare against previous, while using existing segments:

# Input CSVs downloaded from USA Facts (cases and deaths by county)
readList <- list("usafCase"="./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20210608.csv", 
                 "usafDeath"="./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20210608.csv"
                 )
# Previously processed raw data used as the comparison baseline
compareList <- list("usafCase"=cty_newformat_20201026$dfRaw$usafCase, 
                    "usafDeath"=cty_newformat_20201026$dfRaw$usafDeath
                    )

# Process the new data while REUSING the previously created clusters
# (useClusters= passes the 2020-10-26 segments; downloadTo maps a file that
# already exists locally to NA, so the download step is skipped for it)
cty_newdata_20210608 <- readRunUSAFacts(maxDate="2021-06-06", 
                                        downloadTo=lapply(readList, 
                                                          FUN=function(x) if(file.exists(x)) NA else x
                                                          ),
                                        readFrom=readList, 
                                        compareFile=compareList, 
                                        writeLog="./RInputFiles/Coronavirus/USAF_NewData_20210608_v005.log", 
                                        ovrwriteLog=TRUE,
                                        useClusters=cty_newformat_20201026$useClusters,
                                        skipAssessmentPlots=FALSE,
                                        brewPalette="Paired"
                                        )
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   `County Name` = col_character(),
##   State = col_character(),
##   StateFIPS = col_character()
## )
## i Use `spec()` for the full column specifications.
## 
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS 
## 
## 
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date

## 
## 
## Checking for similarity of: column names
## In reference but not in current: 
## In current but not in reference: 
## 
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 225
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_NewData_20210608_v005.log
## 
## Checking for similarity of: county
## In reference but not in current: 00001 02270 06000
## In current but not in reference:

## 
## 
## ***Differences of at least 5 and at least 5%
## 
## 14 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210608_v005.log
## 
## 
## ***Differences of at least 0 and at least 0.1%
## 
## 551 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210608_v005.log
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   `County Name` = col_character(),
##   State = col_character(),
##   StateFIPS = col_character()
## )
## i Use `spec()` for the full column specifications.

## 
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS 
## 
## 
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date

## 
## 
## Checking for similarity of: column names
## In reference but not in current: 
## In current but not in reference: 
## 
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 225
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_NewData_20210608_v005.log
## 
## Checking for similarity of: county
## In reference but not in current: 00001 02270 06000
## In current but not in reference:

## 
## 
## ***Differences of at least 5 and at least 5%
## 
## 29 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210608_v005.log

## 
## 
## ***Differences of at least 0 and at least 0.1%
## 
## 640 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210608_v005.log
## 
## 
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
##   isType   cases new_cases            n
##   <chr>    <dbl>     <dbl>        <dbl>
## 1 before 6.20e+9   3.28e+7 1602886     
## 2 after  6.18e+9   3.27e+7 1577284     
## 3 pctchg 4.58e-3   3.84e-3       0.0160
## 
## 
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
##   isType  deaths   new_deaths            n
##   <chr>    <dbl>        <dbl>        <dbl>
## 1 before 1.26e+8 590404       1602886     
## 2 after  1.25e+8 589015       1577284     
## 3 pctchg 3.75e-3      0.00235       0.0160

## NULL

sparseCountyClusterMap(cty_newdata_20210608$useClusters, 
                       brewPalette="Paired", 
                       caption="Counties with population under 25k are blank"
                       )

There has been significant convergence among segments in average deaths per million and cases per million. This is suggestive of several possibilities, such as that growth in burden may be inversely proportional to previous burden. Next steps are to create segments using the most recent data, seeking to identify differences in 1) cumulative burden, primarily defined by deaths, and 2) shape of the curve in getting to cumulative burden.

New segments are created, with assessments:

# Input CSVs downloaded from USA Facts (cases and deaths by county)
readList <- list("usafCase"="./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20210608.csv", 
                 "usafDeath"="./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20210608.csv"
                 )

# Create new clusters from scratch (useClusters=NULL), reusing the per-capita
# data computed in the prior run; seed fixes the clustering randomness.
# NOTE(review): the exact semantics of the shape/ratio/threshold arguments are
# defined in readRunUSAFacts() (sourced file) -- see Coronavirus_USAF_Functions
cty_newsegs_20210608 <- readRunUSAFacts(maxDate="2021-06-06", 
                                        writeLog="./RInputFiles/Coronavirus/USAF_NewSegs_20210608_v005.log", 
                                        ovrwriteLog=TRUE,
                                        dfPerCapita=cty_newdata_20210608$dfPerCapita,
                                        useClusters=NULL,
                                        skipAssessmentPlots=FALSE,
                                        brewPalette="Paired", 
                                        defaultCluster="999",
                                        minPopCluster=40000, 
                                        hierarchical=NA,
                                        minShape="2020-04",
                                        maxShape="2021-05",
                                        ratioDeathvsCase = 5,
                                        ratioTotalvsShape = 0.5,
                                        minDeath=100,
                                        minCase=5000, 
                                        hmlSegs=3, 
                                        eslSegs=3, 
                                        seed=2106091243
                                        )
## 
## *** File has been checked for uniqueness by: countyFIPS date

## NULL

sparseCountyClusterMap(cty_newsegs_20210608$useClusters, 
                       brewPalette="Paired", 
                       caption="Counties with population under 40k are blank"
                       )

The function createSummary() is updated to 1) create the sparse summary map; and 2) exclude the small-county segment from selected plots:

# Function to create diagnoses and plots for clustering data
# Function to create diagnoses and plots for clustering data
diagnoseClusters <- function(lst, 
                             lstExtract=fullListExtract,
                             clusterFrame=NULL,
                             brewPalette=NA, 
                             clusterType="state",
                             printSummary=TRUE, 
                             printDetailed=TRUE
                             ) {
    
    # FUNCTION ARGUMENTS:
    # lst: a list containing processed clustering data
    # lstExtract: the elements to extract from lst with an optional function for converting the elements
    #             NULL means use the extracted element as-is
    # clusterFrame: tibble of the clusters to be plotted (NULL means create from lst)
    # brewPalette: the color palette to use with scale_*_brewer()
    #              default NA means use the standard color/fill profile
    # clusterType: character variable of form "state" for state clusters and "county" for county
    # printSummary: boolean, should summary plots be printed to the log?
    # printDetailed: boolean, should detailed plots be printed to the log?
    
    # Resolve the key variable used for joins; unknown cluster types are an error
    keyVar <- switch(clusterType, 
                     "state"="state", 
                     "county"="countyFIPS", 
                     stop(paste0("\nThe passed clusterType: ", clusterType, " is not programmed\n"))
                     )
    
    # Derive the cluster frame from lst when the caller has not supplied one
    if (is.null(clusterFrame)) clusterFrame <- clustersToFrame(lst, colNameName=keyVar)
    
    # Build the integrated (detail) and aggregated data sets from lst
    detailDF <- integrateData(lst, lstExtract=lstExtract, otherDF=list(clusterFrame), keyJoin=keyVar)
    aggDF <- combineAggData(detailDF, aggBy=plotCombineAggByMapper[[clusterType]])
    
    # Create the main summary plots
    summaryPlots <- createSummary(aggDF, 
                                  stateClusterDF=clusterFrame, 
                                  brewPalette=brewPalette, 
                                  dataType=clusterType
                                  )
    
    # Create the detailed summaries (p2 metrics follow this cluster type's aggregation mapper)
    detPlots <- createDetailedSummaries(dfDetail=detailDF, 
                                        dfAgg=aggDF, 
                                        detVar=keyVar,
                                        p2DetMetrics=plotCombineAggByMapper[[clusterType]]$agg2$aggVars,
                                        brewPalette=brewPalette
                                        )
    
    # Print the summary plots if requested
    # NOTE(review): references summaryPlots$p5 (sparse county map), which assumes
    # the updated createSummary() that returns a p5 element -- confirm it is in scope
    if (isTRUE(printSummary)) {
        gridExtra::grid.arrange(summaryPlots$p5 + theme(legend.position="none"), 
                                summaryPlots$p3 + theme(legend.position="left"), 
                                summaryPlots$p4, 
                                layout_matrix=rbind(c(1, 2), 
                                                    c(3, 3)
                                                    )
                                )
    }
    
    # Print the detailed plots if requested
    if (isTRUE(printDetailed)) purrr::walk(detPlots, .f=print)
    
    # Return a list of the key plotting files
    list(dfFull=detailDF, 
         dfAgg=aggDF, 
         plotClusters=clusterFrame, 
         summaryPlots=summaryPlots, 
         detPlots=detPlots
         )
    
}



# Function to create detailed cluster summaries
# Function to create detailed cluster summaries
# Produces up to three plots: p1 (cumulative burden bars by detail unit, state-level only),
# p2 (burden trends by cluster, with per-state traces or 10%-90% ribbons),
# p3 (current cumulative burden vs growth over the trailing p3Days)
createDetailedSummaries <- function(dfDetail, 
                                    dfAgg, 
                                    aggVar=c("cluster"), 
                                    detVar=c("state"),
                                    p1Metrics=c("tcpm", "tdpm"), 
                                    p1Order=c("tdpm"), 
                                    p2DetMetrics=c("tcpm7", "tdpm7", "cpm7", "dpm7", "hpm7"),
                                    p2AggMetrics=paste0("wm_", p2DetMetrics),
                                    p3Metrics=p1Metrics,
                                    p3Days=30,
                                    p3Slope=0.25,
                                    mapper=c("tcpm"="Cases per million\n(cumulative)", 
                                             "tdpm"="Deaths per million\n(cumulative)", 
                                             "cpm7"="Cases\nper million", 
                                             "dpm7"="Deaths\nper million",
                                             "hpm7"="Hospitalized\nper million",
                                             "tdpm7"="Deaths (cum)\nper million",
                                             "tcpm7"="Cases (cum)\nper million"
                                    ),
                                    brewPalette=NA
                                    ) {
    
    # FUNCTION ARGUMENTS:
    # dfDetail: data frame or tibble containing detailed (sub-cluster) data
    # dfAgg: data frame or tibble containing aggregated (cluster) data
    # aggVar: variable reflecting the aggregate level
    # detVar: variable reflecting the detailed level
    # p1Metrics: metrics to be shown for plot 1 (will be faceted)
    # p1Order: variable for ordering detVar in p1
    # p2DetMetrics: variables to be included from dfDetail for p2
    # p2AggMetrics: corresponding variables to be included from dfAgg for p2
    # p3Metrics: metrics to be included in the growth plots
    # p3Days: number of days to include for calculating growth
    # p3Slope: the slope for the dashed line for growth in p3
    # mapper: mapping file for variable name to descriptive name
    # brewPalette: character string for a palette from RColorBrewer to be used (NA means default colors)    
    
    # Create plot for aggregates by sub-cluster (latest date per detail unit,
    # bars ordered by the p1Order metric, filled by cluster membership)
    if(detVar=="state") {
        p1 <- dfDetail %>%
            colSelector(vecSelect=c(detVar, aggVar, "date", p1Metrics)) %>%
            pivot_longer(all_of(p1Metrics)) %>%
            filter(!is.na(value)) %>%
            group_by_at(detVar) %>%
            filter(date==max(date)) %>%
            ungroup() %>%
            # NOTE(review): the .fun closure reads 'name' from the data-masking
            # context of fct_reorder; non-ordering metrics are zeroed so only
            # p1Order drives the bar order -- confirm this evaluates as intended
            ggplot(aes(x=fct_reorder(get(detVar), 
                                     value, 
                                     .fun=function(x) { max(ifelse(name==p1Order, x, 0)) }
                                     ),
                       y=value
                       )
                   ) + 
            geom_col(aes(fill=get(aggVar))) + 
            facet_wrap(~mapper[name], scales="free_x") + 
            coord_flip() + 
            labs(title="Cumulative burden", x=NULL, y=NULL)
        if (!is.na(brewPalette)) p1 <- p1 + scale_fill_brewer("Cluster", palette=brewPalette)
    } else {
        # Do not create the plot for other than state-level data
        p1 <- NULL
    }
    
    # Create facetted burden trends by aggregate
    # For state-level, create each state as a line
    # For anything else, create a 10%-90% range
    if (detVar=="state") {
        # One grey line per state, faceted metric-by-cluster
        p2 <- dfDetail %>%
            colSelector(vecSelect=c(detVar, "date", aggVar, p2DetMetrics)) %>%
            pivot_longer(all_of(p2DetMetrics)) %>%
            filter(!is.na(value)) %>%
            ggplot(aes(x=date, y=value)) + 
            geom_line(aes_string(group=detVar), color="grey", size=0.5) + 
            facet_grid(mapper[name] ~ get(aggVar), scales="free_y") + 
            scale_x_date(date_breaks="2 months", date_labels="%b-%y") + 
            theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + 
            labs(x=NULL, y=NULL, title="Burden by cluster and metric")
    } else {
        # Too many counties to draw individually; show the 10th-90th percentile band
        p2 <- dfDetail %>%
            colSelector(vecSelect=c(detVar, "date", aggVar, p2DetMetrics)) %>%
            pivot_longer(all_of(p2DetMetrics)) %>%
            filter(!is.na(value)) %>%
            group_by_at(c(aggVar, "name", "date")) %>%
            summarize(p10=unname(quantile(value, 0.1)), p90=unname(quantile(value, 0.9)), .groups="drop") %>%
            ggplot(aes(x=date)) + 
            geom_ribbon(aes(ymin=p10, ymax=p90), alpha=0.75) +
            facet_grid(mapper[name] ~ get(aggVar), scales="free_y") + 
            scale_x_date(date_breaks="2 months", date_labels="%b-%y") + 
            theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + 
            labs(x=NULL, 
                 y=NULL, 
                 title="Burden by cluster and metric", 
                 subtitle="Shaded region is 10%le through 90%le, solid line is weighted mean"
                 )
    }
    # Rename the wm_* aggregate columns back to the detail metric names so the
    # aggregate layer shares the same 'name'/'value' long format as p2's facets
    aggPlot <- dfAgg %>% 
        colSelector(vecSelect=c("date", aggVar, p2AggMetrics)) %>%
        colRenamer(vecRename=purrr::set_names(p2DetMetrics, p2AggMetrics)) %>%
        pivot_longer(all_of(p2DetMetrics)) %>%
        filter(!is.na(value))
    # Overlay the cluster-level (weighted mean) trend lines on the detail plot
    p2 <- p2 + 
        geom_line(data=aggPlot, aes_string(color=aggVar, group=aggVar, y="value"), size=1.5)
    if (!is.na(brewPalette)) p2 <- p2 + 
        scale_color_brewer("Cluster", palette=brewPalette) + 
        theme(legend.position="none")
    
    # Create growth trends plot
    # (the if (TRUE) wrapper is a retained on/off toggle for this section)
    if (TRUE) {
        # Exclude the "999" small-county segment from county-level growth plots
        if(detVar=="countyFIPS") p3 <- dfDetail %>% filter(cluster != "999") else p3 <- dfDetail
        # Keep only the latest date and the date p3Days earlier per detail unit,
        # then compute growth as the difference between the two retained values
        p3 <- p3 %>%
            colSelector(vecSelect=c(detVar, aggVar, "date", p3Metrics)) %>%
            pivot_longer(all_of(p3Metrics)) %>%
            filter(!is.na(value)) %>%
            group_by_at(c(detVar, "name")) %>%
            filter(date %in% c(max(date), max(date)-lubridate::days(p3Days))) %>%
            mutate(growth=max(value)-min(value)) %>%  # not ideal way to calculate
            filter(date==max(date)) %>%
            ungroup() %>%
            ggplot(aes(x=value, y=growth))
        # States are labelled with their abbreviation; counties plotted as points
        if(detVar=="state") p3 <- p3 + geom_text(aes_string(color=aggVar, label=detVar), fontface="bold")
        else p3 <- p3 + geom_point(aes_string(color=aggVar))
        p3 <- p3 + 
            facet_wrap(~mapper[name], scales="free") + 
            labs(title=paste0("Current vs growth"), 
                 subtitle=paste0("Dashed line represents ", 
                                 round(100*p3Slope), 
                                 "% growth rate over past ", 
                                 p3Days, 
                                 " days"
                                 ),
                 x="Most recent cumulative", 
                 y=paste0("Growth over past ", p3Days, " days")
                 ) + 
            lims(y=c(0, NA), x=c(0, NA)) + 
            theme(panel.background = element_rect(fill = "white", colour = "white"), 
                  panel.grid.major = element_line(size = 0.25, linetype = 'solid', color = "grey")
                  ) + 
            geom_abline(slope=p3Slope, intercept=0, lty=2)
        if (!is.na(brewPalette)) { 
            p3 <- p3 + scale_color_brewer(stringr::str_to_title(aggVar), palette=brewPalette)
        }
        if(detVar=="countyFIPS") p3 <- p3 + labs(caption="Counties below population threshold excluded")
    } else {
        p3 <- NULL
    }
    
    # Return a list of plot objects
    list(p1=p1, p2=p2, p3=p3)
    
}



# Updated function for handling county-level clusters
createSummary <- function(df, 
                          stateClusterDF=NULL,
                          brewPalette=NA, 
                          dataType="state"
                          ) {
    
    # FUNCTION ARGUMENTS:
    # df: an integrated data frame by cluster-date
    # stateClusterDF: a data frame containing state-cluster (NULL means it can be found in df)
    # brewPalette: character string for a palette from RColorBrewer to be used (NA means default colors)
    # dataType: the type of maps being produced ("state" or "county")
    
    # Produces dashboard-ready plots:
    # 1. Map of segments
    # 2. Bar plot of counts by segment
    # 3. Facetted bar plot of segment descriptors (e.g., population, burden per million)
    # 4. Facetted trend-line plot of burden by segments
    # 5. Sparse county cluster map (if dataType is "county")
    
    # Shared inputs computed once up front
    isState <- dataType == "state"
    hasPalette <- !is.na(brewPalette)
    clusterSource <- if (is.null(stateClusterDF)) df else stateClusterDF
    fillArgs <- if (hasPalette) {
        list("arg1"=scale_fill_brewer("Cluster", palette=brewPalette))
    } else {
        list()
    }
    
    # 1. Map of the clusters
    p1 <- helperSummaryMap(clusterSource, 
                           mapLevel=if (isState) "states" else "counties",
                           keyCol=if (isState) "state" else "countyFIPS",
                           discreteValues=TRUE, 
                           labelScale=!hasPalette, 
                           textLabel=if (isState) c("RI", "CT", "DE", "MD", "DC") else c(),
                           extraArgs=fillArgs
                           )
    
    # 2. Bar plot of counts by segment
    p2 <- helperSummaryMap(clusterSource, 
                           mapLevel=if (isState) "states" else "counties",
                           keyCol=if (isState) "state" else "countyFIPS",
                           discreteValues=TRUE, 
                           labelScale=!hasPalette, 
                           countOnly=TRUE,
                           extraArgs=fillArgs
                           )
    
    # 3. Population and burden descriptors by cluster
    p3 <- helperAggTotal(df, 
                         aggVars=c("pop", "wm_tcpm7", "wm_tdpm7"), 
                         mapper=c("pop"="Population (millions)", 
                                  "wm_tcpm7"="Cases per thousand", 
                                  "wm_tdpm7"="Deaths per million"
                                  ), 
                         xLab=NULL, 
                         yLab=NULL, 
                         title=NULL,
                         divideBy=c("pop"=1000000, "wm_tcpm7"=1000), 
                         extraArgs=fillArgs
                         )
    
    # 4. Cumulative burden per million over time (hospitalization metric only at state level)
    trendExtras <- list(arg1=scale_x_date(date_breaks="2 months", date_labels="%b-%y"), 
                        arg2=theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
                        )
    if (hasPalette) trendExtras$arg3 <- scale_color_brewer("Cluster", palette=brewPalette)
    p4 <- helperAggTrend(df, 
                         aggVars=c("wm_tcpm7", "wm_tdpm7", if (isState) "wm_hpm7"), 
                         mapper=c("wm_tcpm7"="Cases per thousand\n(cumulative)", 
                                  "wm_tdpm7"="Deaths per million\n(cumulative)", 
                                  "wm_hpm7"="Hospitalized per million\n(current)"
                                  ),
                         yLab=NULL,
                         title=NULL, 
                         divideBy=c("wm_tcpm7"=1000), 
                         linesize=0.75,
                         extraArgs=trendExtras
                         )
    
    # 5. Sparse county plot (county data only) - assumes default that '999' is 'too small'
    p5 <- NULL
    if (dataType == "county") {
        countyClusters <- clusterSource %>% 
            select(countyFIPS, cluster) %>%
            distinct()
        p5 <- sparseCountyClusterMap(purrr::set_names(countyClusters$cluster, countyClusters$countyFIPS), 
                                     caption="Counties below population threshold left blank",
                                     brewPalette=if (hasPalette) brewPalette else NULL
                                     )
    }
    
    list(p1=p1, p2=p2, p3=p3, p4=p4, p5=p5)
    
}

The updated functions are tested:

# Confirm that new cluster maps are working as intended
# Re-runs the USA Facts pipeline through 2021-06-06, reusing the per-capita
# data and cluster assignments from the earlier cty_newsegs_20210608 run so
# only the assessment plots are regenerated (skipAssessmentPlots=FALSE)
cty_chksegs_20210608 <- readRunUSAFacts(maxDate="2021-06-06", 
                                        dfPerCapita=cty_newsegs_20210608$dfPerCapita,
                                        useClusters=cty_newsegs_20210608$useClusters,
                                        skipAssessmentPlots=FALSE, 
                                        brewPalette="Paired"
                                        )

## NULL

The updated maps blank out the ‘999’ (small population) cluster as desired. The clusters are designed using a 3x3 grid of total burden (mainly deaths) crossed with the shape of the curve. Shapes of the curve are plotted, normalized so that each curve sums to 100%:

# Long-format rolling-7 cases/deaths by cluster-date, excluding the '999'
# (small-population) cluster; pct normalizes each cluster-metric curve to sum to 1
p1Data <- cty_chksegs_20210608$plotDataList$dfAgg %>% 
    pivotData(pivotKeys=c("cluster", "date", "pop")) %>% 
    filter(!is.na(value), cluster!="999", name %in% c("wm_cpm7", "wm_dpm7")) %>% 
    group_by(cluster, name) %>% 
    mutate(pct=value/sum(value)) %>% 
    ungroup()

# Plot all together
# FIX: labs() has no 'main' argument in ggplot2 (that is base graphics);
# the plot title must be passed as 'title' or it is silently dropped
p1Data %>% 
    ggplot(aes(x=date, y=pct)) + 
    geom_line(aes(group=cluster, color=cluster), lwd=1) + 
    facet_wrap(~c("wm_cpm7"="Rolling 7 cases", "wm_dpm7"="Rolling 7 deaths")[name]) + 
    scale_color_brewer(palette="Paired") + 
    labs(x=NULL, 
         y="Percentage of total burden", 
         title="Burden over time, normalized to 100% cumulative burden"
         )

# Plot as facets (metric rows x cluster columns)
# FIX: labs() has no 'main' argument in ggplot2; use 'title' so the
# heading actually renders instead of being silently dropped
p1Data %>% 
    ggplot(aes(x=date, y=pct)) + 
    geom_line(aes(group=cluster, color=cluster), lwd=1) + 
    facet_grid(c("wm_cpm7"="Rolling 7 cases", "wm_dpm7"="Rolling 7 deaths")[name]~cluster) + 
    scale_color_brewer(palette="Paired") + 
    labs(x=NULL, 
         y="Percentage of total burden", 
         title="Burden over time, normalized to 100% cumulative burden"
         )

Broadly, the shapes selected by the clustering include:

  1. Two spikes, primary impact early
  2. Two spikes, balanced early and late impact
  3. One main spike, late

Broadly, there are examples of relatively higher and lower burden within each of these shapes, though the highest burden clusters mainly had two spikes with primary impact early and the medium burden clusters rarely had the primary impact early.

Next, all counties of at least 50,000 population are examined for the shape of the cumulative deaths curve:

# County-level cumulative burden (tdpm7/tcpm7) for counties of 50k+ population;
# pct scales each county-metric series by its own maximum value
p2Data <- cty_chksegs_20210608$plotDataList$dfFull %>% 
    select(countyFIPS, pop, state, date, tdpm7, tcpm7) %>%
    pivotData(pivotKeys=c("countyFIPS", "state", "date", "pop")) %>% 
    filter(!is.na(value), pop>=50000) %>% 
    group_by(countyFIPS, name) %>% 
    mutate(pct=value/max(value)) %>% 
    ungroup()

# Wide county-by-date matrix of scaled cumulative deaths (tdpm7 pct),
# then euclidean distances between county curves
p2Dist <- p2Data %>% 
    filter(name=="tdpm7") %>% 
    select(countyFIPS, date, pct) %>% 
    pivotData(pivotKeys="countyFIPS", toLonger = FALSE, nameVar="date", valVar="pct") %>%
    column_to_rownames("countyFIPS") %>%
    as.matrix() %>%
    dist()

# Hierarchical clustering under two linkage methods for comparison
p2Complete <- hclust(p2Dist, method="complete")
plot(p2Complete)

p2Single <- hclust(p2Dist, method="single")
plot(p2Single)

There is something peculiar about data in ‘32510’. A plot is created:

# Highlight county 32510 against the mean curve across all counties
# FIX: 32510 is a countyFIPS code (the series is filtered on countyFIPS),
# not a cluster id -- the title previously mislabeled it as a cluster
p2Data %>%
    ggplot(aes(x=date, y=pct)) +
    geom_line(data=~filter(., countyFIPS=="32510"), aes(group=countyFIPS), color="red", lwd=2) + 
    geom_line(data=~summarize(group_by(., name, date), pct=mean(pct), .groups="drop")) +
    facet_wrap(~c("tcpm7"="Rolling 7 cases", "tdpm7"="Rolling 7 deaths")[name]) + 
    labs(x=NULL, y="Percentage of total burden on day", title="County 32510 is red, mean is black")

There is clearly a data issue with county 32510. Clusters are examined at a variable number of cut points:

plotHierarchical <- function(obj, k, df) {
    
    # FUNCTION ARGUMENTS:
    # obj: clustering object from hierarchical clustering
    # k: number of clusters to create
    # df: data frame with burden data (long format: countyFIPS, name, date, pct)
    #
    # Prints cluster counts and two faceted plots (cumulative and incremental),
    # then returns the cluster-date-metric summary frame.
    
    # Counts by cluster (printed for quick inspection)
    ctCluster <- cutree(obj, k=k) %>%
        table() %>%
        print()
    
    # Make clustering data frame: per-county daily increments (first row keeps
    # its level as the increment), then cluster-date summaries of both series
    clData <- cutree(obj, k=k) %>%
        clustersToFrame(colNameName="countyFIPS") %>%
        joinFrames(df) %>%
        group_by(countyFIPS, name) %>%
        mutate(delta=ifelse(row_number()==1, pct, pct-lag(pct))) %>%
        ungroup() %>%
        group_by(cluster, date, name) %>%
        summarize(p05cum=quantile(pct, 0.05), 
                  mucum=mean(pct), 
                  p95cum=quantile(pct, 0.95), 
                  p05delta=quantile(delta, 0.05), 
                  mudelta=mean(delta), 
                  p95delta=quantile(delta, 0.95), 
                  .groups="drop"
                  )
    
    # Plot cumulative metric by cluster: mean line with 5th-95th percentile ribbon
    p1 <- clData %>%
        ggplot(aes(x=date)) + 
        geom_line(aes(color=cluster, y=mucum), lwd=1) + 
        geom_ribbon(aes(ymin=p05cum, ymax=p95cum), alpha=0.25) +
        facet_grid(c("tcpm7"="% cumulative cases", "tdpm7"="% cumulative deaths")[name]~cluster) + 
        scale_color_brewer(palette="Set1") + 
        labs(x=NULL, y="% of cumulative", title="Mean and 90% range by cluster and metric")
    print(p1)
    
    # Plot incremental rather than cumulative
    p2 <- clData %>%
        ggplot(aes(x=date)) + 
        geom_line(aes(color=cluster, y=mudelta), lwd=1) + 
        geom_ribbon(aes(ymin=p05delta, ymax=p95delta), alpha=0.25) +
        facet_grid(c("tcpm7"="% total cases", "tdpm7"="% total deaths")[name]~cluster) + 
        scale_color_brewer(palette="Set1") + 
        labs(x=NULL, y="% of total", title="Mean and 90% range by cluster and metric")
    print(p2)
    
    # Return the clustering frame
    clData
    
}

plotList <- lapply(2:5, FUN=function(x) plotHierarchical(p2Complete, k=x, df=p2Data))
## .
##   1   2 
## 841 149
## Joining, by = "countyFIPS"

## .
##   1   2   3 
## 773  68 149
## Joining, by = "countyFIPS"

## .
##   1   2   3   4 
## 498 275  68 149
## Joining, by = "countyFIPS"

## .
##   1   2   3   4   5 
## 498 275  68 114  35
## Joining, by = "countyFIPS"

At a glance, with 5 clusters, there is a clear early cluster, a clear early/late cluster, and a group of three seemingly similar clusters (primarily late). Further exploration of these three clusters may be merited.

The five clusters are examined on a single plot:

tempPlotter <- function(df, vecLate, facetScales="free_y", showRibbon=TRUE, showCum=FALSE) {

    # FUNCTION ARGUMENTS:
    # df: cluster summary frame (e.g., an element of plotList) with mucum/mudelta
    #     and p05/p95 percentile columns
    # vecLate: cluster ids to assign to the "late" facet row
    # facetScales: 'scales' argument for facet_grid()
    # showRibbon: add a shaded 5th-95th percentile ribbon?
    # showCum: plot cumulative (mucum) instead of incremental (mudelta)?
    
    # NOTE: the if(showCum) expressions inside aes() are evaluated lazily at
    # plot-build time, resolving showCum from this function's environment
    p1 <- df %>% 
        mutate(lateCluster=(cluster %in% vecLate)) %>%
        ggplot(aes(x=date)) + 
        geom_line(aes(group=cluster, color=cluster, y=if(showCum) mucum else mudelta), lwd=1) + 
        facet_grid(lateCluster~c("tcpm7"="% total cases", "tdpm7"="% total deaths")[name], 
                   scales=facetScales
                   ) + 
        scale_color_brewer(palette="Set1") + 
        labs(x=NULL, 
             y=NULL, 
             title="Disease burden by time period and shape cluster"
             )
    
    # Optionally layer the 90% range ribbon on top of the mean lines
    if (isTRUE(showRibbon)) {
        p1 <- p1 + 
            geom_ribbon(aes(ymin=if(showCum) p05cum else p05delta, 
                            ymax=if(showCum) p95cum else p95delta, 
                            fill=cluster, 
                            group=cluster
                            ), alpha=0.25) + 
            scale_fill_brewer(palette="Set1") + 
            labs(subtitle="Shaded region covers 90% of observations")
    }

    p1
    
}

# plotList[[4]] holds the k=5 summary (lapply ran k = 2:5);
# clusters 4 and 5 go in the "late" facet row
# Incremental burden, lines only
tempPlotter(plotList[[4]], vecLate=c(4, 5), showRibbon=FALSE)

# Cumulative burden, lines only
tempPlotter(plotList[[4]], vecLate=c(4, 5), showRibbon=FALSE, showCum=TRUE)

# Same two views with the 90% ribbon included
tempPlotter(plotList[[4]], vecLate=c(4, 5))

tempPlotter(plotList[[4]], vecLate=c(4, 5), showCum=TRUE)

Visually, there are meaningful distinctions in the clusters when overlaid, particularly with the shape of the cumulative curve:

The geographic locations of the five hierarchical cluster types are plotted:

tempMapper <- function(obj, 
                       k, 
                       regions="counties", 
                       varName=if(regions=="counties") "fips" else "state", 
                       values="cluster", 
                       lvlOrder=NULL, 
                       title=if(is.null(lvlOrder)) NULL else "Lightest colors have earliest burden",
                       caption=NULL
                       ) {
    
    # Cut the hierarchical tree into k clusters and convert to a frame keyed by varName
    clusterFrame <- clustersToFrame(cutree(obj, k=k), colNameName=varName)
    
    # Optionally impose an explicit ordering on the cluster factor levels
    if (!is.null(lvlOrder)) {
        clusterFrame[[values]] <- factor(clusterFrame[[values]], levels=lvlOrder)
    }
    
    # Map the clusters (counties without assignments are left blank)
    sparseCountyClusterMap(clusterFrame, brewPalette="viridis", caption=caption) + 
        labs(title=title)
    
}

tmpText <- "Plots only counties with at least 50k population"
# Maps for k = 2 through 5; lvlOrder arranges clusters so lighter colors
# correspond to earlier burden
tempMapper(p2Complete, k=2, lvlOrder=c(1, 2), caption=tmpText)

tempMapper(p2Complete, k=3, lvlOrder=c(2, 1, 3), caption=tmpText)

tempMapper(p2Complete, k=4, lvlOrder=c(3, 2, 1, 4), caption=tmpText)

tempMapper(p2Complete, k=5, lvlOrder=c(3, 2, 1, 4, 5), caption=tmpText)

Maps are created to show when counties first cleared specific hurdles for cumulative deaths per million:

tempBurdenMapper <- function(df, 
                             popMin=1, 
                             popMax=Inf, 
                             tgtVar="tdpm", 
                             tgtBPM=1000, 
                             fn=lubridate::quarter, 
                             title=paste0("Time when county first hit ", tgtBPM, " on metric ", tgtVar), 
                             caption=paste("Plots counties with population between", popMin, "and", popMax),
                             ...
                             ) {
    
    # FUNCTION ARGUMENTS:
    # df: county-day data with countyFIPS, pop, date, and the tgtVar column
    # popMin / popMax: population bounds for inclusion ([popMin, popMax))
    # tgtVar: name of the burden metric column (looked up via get())
    # tgtBPM: threshold the metric must first reach
    # fn: function binning keyDate into a cluster label (default: calendar quarter)
    # title / caption: plot annotations
    # ...: forwarded to fn (e.g., with_year=TRUE for lubridate::quarter)
    
    p1 <- df %>% 
        filter(pop < popMax, pop >= popMin) %>% 
        group_by(countyFIPS) %>% 
        # keyDate: first date the metric reached tgtBPM; specNA(min) presumably
        # wraps min() to return NA when no date qualifies -- TODO confirm
        summarize(bpmMax=max(get(tgtVar), na.rm=TRUE), 
                  keyDate=as.Date(specNA(min)(ifelse(get(tgtVar) >= tgtBPM, date, NA)), origin="1970-01-01"), 
                  .groups="drop"
                  ) %>% 
        mutate(cluster=fn(keyDate, ...), 
               cluster=ifelse(is.na(cluster), "not yet", cluster),  # counties that never hit the threshold
               fips=countyFIPS
               ) %>% 
        sparseCountyClusterMap(brewPalette = "viridis", caption=caption)
    
    p1 + labs(title=title)
    
}

# Looking at tdpm of 500, 1000, 5000 (quarter-with-year labels)
tempBurdenMapper(cty_chksegs_20210608$plotDataList$dfFull, tgtBPM=500, with_year=TRUE)

tempBurdenMapper(cty_chksegs_20210608$plotDataList$dfFull, tgtBPM=1000, with_year=TRUE)

tempBurdenMapper(cty_chksegs_20210608$plotDataList$dfFull, tgtBPM=5000, with_year=TRUE)

# Looking specifically at 2500 tdpm, split by county population bands
tempBurdenMapper(cty_chksegs_20210608$plotDataList$dfFull, tgtBPM=2500, with_year=TRUE)

tempBurdenMapper(cty_chksegs_20210608$plotDataList$dfFull, popMin=25000, tgtBPM=2500, with_year=TRUE)

tempBurdenMapper(cty_chksegs_20210608$plotDataList$dfFull, popMax=25000, tgtBPM=2500, with_year=TRUE)

Next steps are to create curves using larger counties, then assess how well they fit the shape of the curves in the smaller counties.

A function is created to calculate the distances from each of the cluster means:

tempDistCluster <- function(dfClust, dfAll, metType="tdpm7", metClust="mucum") {
    
    # Cluster-mean curves for the chosen metric, keyed by cluster-date,
    # with the chosen summary column renamed to 'pct'
    clusterMeans <- dfClust %>%
        filter(name==metType) %>%
        colSelector(vecSelect=c("cluster", "date", metClust)) %>%
        colRenamer(vecRename=c("pct") %>% purrr::set_names(metClust))
    
    # County-level observations for the same metric
    countyObs <- dfAll %>%
        filter(name==metType) %>%
        colSelector(vecSelect=c("countyFIPS", "date", "pct"))
    
    # Pair every cluster mean with every county via the date join, then sum
    # squared differences (missing values treated as 0); sorted so the closest
    # cluster comes first within each county
    clusterMeans %>%
        left_join(countyObs, by="date") %>%
        mutate(delta2=(ifelse(is.na(pct.x), 0, pct.x) - ifelse(is.na(pct.y), 0, pct.y))^2) %>%
        group_by(cluster, countyFIPS) %>%
        summarize(dist=sum(delta2), .groups="drop") %>%
        arrange(countyFIPS, dist)
    
}

# Create data for all counties (pop > 0, no 50k floor this time)
p2All <- cty_chksegs_20210608$plotDataList$dfFull %>% 
    select(countyFIPS, pop, state, date, tdpm7, tcpm7) %>%
    pivotData(pivotKeys=c("countyFIPS", "state", "date", "pop")) %>% 
    filter(!is.na(value), pop>0) %>% 
    group_by(countyFIPS, name) %>% 
    mutate(pct=value/max(value)) %>% 
    ungroup()

# Calculate distances to cluster mean, then print
# (plotList[[4]] is the k=5 summary from the earlier lapply over 2:5)
countyClusterDist <- tempDistCluster(dfClust=plotList[[4]], dfAll=p2All)
countyClusterDist
## # A tibble: 15,710 x 3
##    cluster countyFIPS   dist
##    <ord>   <chr>       <dbl>
##  1 1       01001       1.20 
##  2 2       01001       3.10 
##  3 3       01001       5.48 
##  4 4       01001      14.8  
##  5 5       01001      48.8  
##  6 2       01003       0.938
##  7 1       01003       1.88 
##  8 3       01003       7.53 
##  9 4       01003      19.3  
## 10 5       01003      58.0  
## # ... with 15,700 more rows
# Assess overlap of cluster for counties already clustered
tmpDistDF <- cutree(p2Complete, k=5) %>%
    vecToTibble(colNameName="countyFIPS", colNameData="hierCluster") %>%
    left_join(countyClusterDist, by="countyFIPS") %>%
    arrange(countyFIPS, dist)
# Heatmap of assigned hierarchical cluster vs closest cluster mean
tmpDistDF %>%
    group_by(countyFIPS) %>%
    filter(row_number()==1) %>%  # keep only the closest cluster mean per county
    ungroup() %>%
    mutate(cluster=as.integer(as.character(cluster))) %>%
    count(hierCluster, cluster) %>%
    group_by(hierCluster) %>%
    mutate(pct=n/sum(n)) %>%  # row-wise percentage within each assigned cluster
    ungroup() %>%
    ggplot(aes(x=cluster, y=hierCluster)) + 
    geom_tile(aes(fill=pct)) +
    geom_text(aes(label=paste0("n=", n, "\n(", round(100*pct), "%)"))) + 
    scale_fill_continuous(low="white", high="lightgreen") +
    labs(title="Overlap of assigned hierarchical cluster and closest hierarchical cluster mean", 
         x="Closest hierarchical cluster mean (k=5)", 
         y="Assigned hierarchical cluster (k=5)"
         )

Most counties are closest to the mean of the hierarchical cluster they are assigned to. There are some differences, as expected since method=“complete” was chosen for the hierarchical clusters. Assignments are then made for all counties, and then plotted:

# Full clustering database: each county assigned to its closest cluster mean,
# with the original hierarchical assignment joined on (NA for counties not clustered)
distOnlyClust <- countyClusterDist %>%
    arrange(countyFIPS, dist) %>%
    group_by(countyFIPS) %>%
    filter(row_number()==1) %>%
    ungroup() %>%
    left_join(cutree(p2Complete, k=5) %>% 
                  vecToTibble(colNameName="countyFIPS", colNameData="hierCluster"), 
              by="countyFIPS"
              )

# Cluster assignments depending on size of county
# NOTE: ..count.. is the pre-after_stat() syntax, consistent with the
# ggplot2 3.3.x loaded by this document
distOnlyClust %>%
    ggplot(aes(x=cluster)) + 
    geom_bar(fill="lightblue") + 
    geom_text(stat="count", aes(y=..count../2, label=..count..)) +
    facet_wrap(~ifelse(is.na(hierCluster), "Not in hierarchical data", "In hierarchical data"), 
               scales="free_y"
               ) + 
    labs(title="Closest hierarchical cluster")

# Plot all counties based on closest cluster (levels ordered so lighter = earlier)
sparseCountyClusterMap(rename(distOnlyClust, fips=countyFIPS) %>% 
                           mutate(cluster=factor(cluster, levels=c(3, 2, 1, 4, 5))), 
                       brewPalette="viridis", 
                       caption="Closest hierarchical cluster mean (euclidean distance of % cumulative death)"
                       ) + 
    labs(title="Lighter clusters have higher percentage of burden early")

There are clear geographic patterns to the data, with counties having earlier disease generally being surrounded by other counties with (relatively) earlier disease.

The latest burden is also included:

# Latest cumulative deaths per million per county, joined to the
# closest-cluster assignments
p2Merge <- p2All %>%
    filter(date==max(date), name=="tdpm7") %>%
    select(countyFIPS, pop, state, date, tdpm7=value) %>%
    inner_join(distOnlyClust, by=c("countyFIPS"))

# Density of final deaths per million by cluster, split by county size
p2Merge %>%
    mutate(countySize=ifelse(pop>=50000, ">50k", "<50k")) %>%
    ggplot(aes(x=tdpm7)) + 
    geom_density(aes(color=cluster), size=1) + 
    facet_wrap(~countySize) + 
    scale_color_brewer(palette="Set1") + 
    labs(x="Cumulative deaths per million", y="Relative density", title="Deaths vs shape segments")

Smaller counties are more likely to be very low or very high on deaths per million, as expected given the smaller denominators. Among the larger counties, segment 5 appears to generally have higher deaths, a trend that does not appear to hold in smaller counties.

ECDF curves are created as well:

# Manual ECDF (prop = rank / group size) of deaths per million, faceted by
# cluster and colored by county size; includes all counties
p2Merge %>%
    mutate(countySize=ifelse(pop>=50000, ">50k", "<50k")) %>%
    arrange(cluster, countySize, tdpm7) %>%
    group_by(cluster, countySize) %>%
    mutate(prop=row_number()/n()) %>%
    ungroup() %>%
    ggplot(aes(x=tdpm7, y=prop)) + 
    geom_line(aes(group=countySize, color=countySize), size=1) + 
    facet_wrap(~cluster) + 
    scale_color_brewer(palette="Set1") + 
    labs(x="Cumulative deaths per million", 
         y="ECDF", 
         title="Deaths vs shape segments", 
         subtitle="Includes all counties (even with 0 deaths)"
         )

# Same ECDF restricted to counties with at least one death, faceted by
# county size and colored by cluster
p2Merge %>%
    mutate(countySize=ifelse(pop>=50000, ">50k", "<50k")) %>%
    filter(tdpm7>0) %>%
    arrange(cluster, countySize, tdpm7) %>%
    group_by(cluster, countySize) %>%
    mutate(prop=row_number()/n()) %>%
    ungroup() %>%
    ggplot(aes(x=tdpm7, y=prop)) + 
    geom_line(aes(group=cluster, color=cluster), size=1) + 
    facet_wrap(~countySize) + 
    scale_color_brewer(palette="Set1") + 
    labs(x="Cumulative deaths per million", 
         y="ECDF", 
         title="Deaths vs shape segments", 
         subtitle="Includes only counties with 1+ deaths"
         )

Larger counties appear to generally have had lower burdens, with the exception of very low burden, where there is some crossing of the curves at around 500 deaths per million.

Suppose that counties are clustered based on cumulative deaths (not percentage) by month, with focus on larger counties first:

# Include all counties with population greater than 0
p2DataAll <- cty_chksegs_20210608$plotDataList$dfFull %>% 
    select(countyFIPS, pop, state, date, tdpm7, tcpm7) %>%
    pivotData(pivotKeys=c("countyFIPS", "state", "date", "pop")) %>% 
    filter(!is.na(value), pop>=1) %>% 
    group_by(countyFIPS, name) %>% 
    mutate(pct=value/max(value)) %>% 
    ungroup()

# Create distance matrix and hierarchical clusters
# NOTE: clusters on the raw 'value' (deaths per million), not the scaled 'pct',
# and restricts to counties of 100k+ population
p2DistValue100k <- p2DataAll %>% 
    filter(name=="tdpm7", pop>=100000) %>% 
    select(countyFIPS, date, value) %>% 
    pivotData(pivotKeys="countyFIPS", toLonger = FALSE, nameVar="date", valVar="value") %>%
    column_to_rownames("countyFIPS") %>%
    as.matrix() %>%
    dist()

# Run clustering with method "complete"
p2CompleteValue100k <- hclust(p2DistValue100k, method="complete")
plot(p2CompleteValue100k)

There appears to be a branch of counties that are unique:

# Plot the burden curves of the smaller of the two top-level branches
cutree(p2CompleteValue100k, k=2) %>% 
    vecToTibble(colNameName="countyFIPS", colNameData="cluster") %>%
    group_by(cluster) %>%
    mutate(n=n()) %>%
    ungroup() %>%
    filter(n==min(n)) %>%  # keep only the smaller cluster
    left_join(p2DataAll, by="countyFIPS") %>%
    # no 'by=' here, so dplyr joins on the shared columns (countyFIPS, state)
    # and emits the "Joining, by = ..." message shown below
    left_join(select(getCountyData(), countyFIPS, countyName, state)) %>%
    filter(name=="tdpm7") %>%
    mutate(countyName=paste0(countyFIPS, " - ", 
                             stringr::str_replace(countyName, " County", ""), 
                             " (", 
                             state, 
                             ")"
                             )
           ) %>%
    ggplot(aes(x=date, y=value)) + 
    geom_line() + 
    facet_wrap(~countyName) + 
    labs(x=NULL, y="Deaths per million", title="Counties in the first, small segment")
## Joining, by = c("countyFIPS", "state")

These counties have high total burdens combined with shapes differing from the norm. This leads them to become their own clusters, requiring a greater number of clusters to get at the main groupings:

# Updated to allow for variable other than pct
plotHierarchical <- function(obj, k, df, keyVar="pct", facetScale="fixed") {
    
    # FUNCTION ARGUMENTS:
    # obj: clustering object from hierarchical clustering
    # k: number of clusters to create
    # df: data frame with burden data
    # keyVar: the key variable to use in df (e.g., "pct" or "value"), looked up via get()
    # facetScale: 'scales' argument for facet_grid() (e.g., "fixed" or "free_y")
    
    # Counts by cluster (printed for quick inspection)
    ctCluster <- cutree(obj, k=k) %>%
        table() %>%
        print()
    
    # Make clustering data frame: per-county increments of keyVar (first row
    # keeps its level as the increment), then cluster-date-metric summaries
    clData <- cutree(obj, k=k) %>%
        clustersToFrame(colNameName="countyFIPS") %>%
        joinFrames(df) %>%
        group_by(countyFIPS, name) %>%
        mutate(delta=ifelse(row_number()==1, get(keyVar), get(keyVar)-lag(get(keyVar)))) %>%
        ungroup() %>%
        group_by(cluster, date, name) %>%
        summarize(p05cum=quantile(get(keyVar), 0.05), 
                  mucum=mean(get(keyVar)), 
                  p95cum=quantile(get(keyVar), 0.95), 
                  p05delta=quantile(delta, 0.05), 
                  mudelta=mean(delta), 
                  p95delta=quantile(delta, 0.95), 
                  .groups="drop"
                  )
    
    # Plot cumulative metric by cluster: mean line with 5th-95th percentile ribbon
    p1 <- clData %>%
        ggplot(aes(x=date)) + 
        geom_line(aes(color=cluster, y=mucum), lwd=1) + 
        geom_ribbon(aes(ymin=p05cum, ymax=p95cum), alpha=0.25) +
        facet_grid(c("tcpm7"="Cumulative cases", "tdpm7"="Cumulative deaths")[name]~cluster, 
                   scales=facetScale
                   ) + 
        scale_color_brewer(palette="Set1") + 
        labs(x=NULL, y="Cumulative", title="Mean and 90% range by cluster and metric")
    print(p1)
    
    # Plot incremental rather than cumulative
    p2 <- clData %>%
        ggplot(aes(x=date)) + 
        geom_line(aes(color=cluster, y=mudelta), lwd=1) + 
        geom_ribbon(aes(ymin=p05delta, ymax=p95delta), alpha=0.25) +
        facet_grid(c("tcpm7"="Cases", "tdpm7"="Deaths")[name]~cluster, scales=facetScale) + 
        scale_color_brewer(palette="Set1") + 
        labs(x=NULL, y="Burden", title="Mean and 90% range by cluster and metric")
    print(p2)
    
    # Return the clustering frame
    clData
    
}

# Diagnostics for k = 2 through 9 on the value-based clustering of 100k+ counties
plotListValue100k <- lapply(2:9, 
                            FUN=function(x) plotHierarchical(p2CompleteValue100k, 
                                                             k=x, 
                                                             df=filter(p2DataAll, pop>=100000), 
                                                             keyVar="value", 
                                                             facetScale="free_y"
                                                             )
                            )
## .
##   1   2 
## 586  13
## Joining, by = "countyFIPS"

## .
##   1   2   3 
## 319 267  13
## Joining, by = "countyFIPS"

## .
##   1   2   3   4 
## 319 267   4   9
## Joining, by = "countyFIPS"

## .
##   1   2   3   4   5 
## 319 236   4  31   9
## Joining, by = "countyFIPS"

## .
##   1   2   3   4   5   6 
## 319 236   4  31   7   2
## Joining, by = "countyFIPS"

## .
##   1   2   3   4   5   6   7 
## 319  81 155   4  31   7   2
## Joining, by = "countyFIPS"

## .
##   1   2   3   4   5   6   7   8 
## 319  81 155   4  23   8   7   2
## Joining, by = "countyFIPS"

## .
##   1   2   3   4   5   6   7   8   9 
## 319  81 144   4  23  11   8   7   2
## Joining, by = "countyFIPS"

With 9 segments, the following shapes and burdens (using deaths as the metric) are noted:

Separating counties with very high burden and very low burden appears reasonable. Clustering can be re-run with only the counties showing moderate disease burden, where shape is a more meaningful factor.

Segments are selected using k=8, with the four smallest buckets consolidated:

# Cut at k=8, then collapse any bucket of fewer than 20 counties into cluster "9"
dfClust100k_k8 <- cutree(p2CompleteValue100k, k=8) %>%
    vecToTibble(colNameData="tempCluster", colNameName="countyFIPS") %>%
    group_by(tempCluster) %>%
    mutate(cluster=factor(ifelse(n()>=20, tempCluster, 9))) %>%
    ungroup()

dfClust100k_k8
## # A tibble: 599 x 3
##    countyFIPS tempCluster cluster
##    <chr>            <int> <fct>  
##  1 01003                1 1      
##  2 01015                2 2      
##  3 01055                2 2      
##  4 01069                2 2      
##  5 01073                2 2      
##  6 01081                1 1      
##  7 01089                1 1      
##  8 01097                3 3      
##  9 01101                2 2      
## 10 01103                2 2      
## # ... with 589 more rows
count(dfClust100k_k8, cluster)
## # A tibble: 5 x 2
##   cluster     n
##   <fct>   <int>
## 1 1         319
## 2 2          81
## 3 3         155
## 4 9          21
## 5 5          23

The plotHierarchical function is updated to allow for passing clusters to it directly:

# Updated to allow for variable other than pct and obj that is of class data.frame
plotHierarchical <- function(obj, k, df, keyVar="pct", facetScale="fixed") {
    
    # FUNCTION ARGUMENTS:
    # obj: clustering object from hierarchical clustering OR df with countyFIPS-cluster
    # k: number of clusters to create (used only when obj is an hclust object)
    # df: data frame with burden data
    # keyVar: the key variable to use in df (e.g., "pct" or "value"), looked up via get()
    # facetScale: 'scales' argument for facet_grid() (e.g., "fixed" or "free_y")
    
    # Convert obj to a data frame if obj has been passed as an hclust object
    # (inherits() is the idiomatic class test and also accepts tibbles)
    if (!inherits(obj, "data.frame")) {
        
        if (!inherits(obj, "hclust")) 
            stop("\nMust pass object that is member of class data.frame or hclust\n")
        
        # Print counts by cluster
        cutree(obj, k=k) %>%
            table() %>%
            print()
        
        # Convert obj to relevant data frame
        obj <- cutree(obj, k=k) %>%
            clustersToFrame(colNameName="countyFIPS")
        
    }
    
    # Make clustering data frame (obj has been converted to data.frame above if not already passed that way):
    # per-county increments of keyVar (first row keeps its level as the
    # increment), then cluster-date-metric summaries of level and increment
    clData <- obj %>%
        joinFrames(df) %>%
        group_by(countyFIPS, name) %>%
        mutate(delta=ifelse(row_number()==1, get(keyVar), get(keyVar)-lag(get(keyVar)))) %>%
        ungroup() %>%
        group_by(cluster, date, name) %>%
        summarize(p05cum=quantile(get(keyVar), 0.05), 
                  mucum=mean(get(keyVar)), 
                  p95cum=quantile(get(keyVar), 0.95), 
                  p05delta=quantile(delta, 0.05), 
                  mudelta=mean(delta), 
                  p95delta=quantile(delta, 0.95), 
                  .groups="drop"
                  )
    
    # Plot cumulative metric by cluster: mean line with 5th-95th percentile ribbon
    p1 <- clData %>%
        ggplot(aes(x=date)) + 
        geom_line(aes(color=cluster, y=mucum), lwd=1) + 
        geom_ribbon(aes(ymin=p05cum, ymax=p95cum), alpha=0.25) +
        facet_grid(c("tcpm7"="Cumulative cases", "tdpm7"="Cumulative deaths")[name]~cluster, 
                   scales=facetScale
                   ) + 
        scale_color_brewer(palette="Set1") + 
        labs(x=NULL, y="Cumulative", title="Mean and 90% range by cluster and metric")
    print(p1)
    
    # Plot incremental rather than cumulative
    p2 <- clData %>%
        ggplot(aes(x=date)) + 
        geom_line(aes(color=cluster, y=mudelta), lwd=1) + 
        geom_ribbon(aes(ymin=p05delta, ymax=p95delta), alpha=0.25) +
        facet_grid(c("tcpm7"="Cases", "tdpm7"="Deaths")[name]~cluster, scales=facetScale) + 
        scale_color_brewer(palette="Set1") + 
        labs(x=NULL, y="Burden", title="Mean and 90% range by cluster and metric")
    print(p2)
    
    # Return the clustering frame
    clData
    
}

# Pass the consolidated cluster assignments directly as a data frame
# (k is only used when obj is an hclust object, so it is omitted here)
clData_100k_k8 <- plotHierarchical(obj=select(dfClust100k_k8, countyFIPS, cluster), 
                                   df=p2DataAll, 
                                   keyVar="value", 
                                   facetScale="free_y"
                                   )

Next, the existing algorithm is updated to assign every cluster to the closest cluster mean:

tempDistCluster <- function(dfClust, dfAll, metType="tdpm7", metClust="mucum", typeUse="pct") {
    
    # Cluster-mean curves for the chosen metric, keyed by cluster-date,
    # with the chosen summary column renamed to 'keyMetric'
    clusterMeans <- dfClust %>%
        filter(name==metType) %>%
        colSelector(vecSelect=c("cluster", "date", metClust)) %>%
        colRenamer(vecRename=c("keyMetric") %>% purrr::set_names(metClust))
    
    # County-level observations of the same metric, using the typeUse column
    countyObs <- dfAll %>%
        filter(name==metType) %>%
        colSelector(vecSelect=c("countyFIPS", "date", typeUse)) %>%
        colRenamer(vecRename=c("keyMetric") %>% purrr::set_names(typeUse))
    
    # Pair every cluster mean with every county via the date join, sum squared
    # differences (NA treated as 0), and sort so the closest cluster is first
    clusterMeans %>%
        left_join(countyObs, by="date") %>%
        mutate(delta2=(coalesce(keyMetric.x, 0) - coalesce(keyMetric.y, 0))^2) %>%
        group_by(cluster, countyFIPS) %>%
        summarize(dist=sum(delta2), .groups="drop") %>%
        arrange(countyFIPS, dist)
    
}

# Calculate distances to cluster mean, then print
# (one row per cluster x county; sorted so each county's closest cluster is first)
countyClusterDist_new <- tempDistCluster(dfClust=clData_100k_k8, dfAll=p2DataAll, typeUse="value")
countyClusterDist_new
## # A tibble: 15,710 x 3
##    cluster countyFIPS        dist
##    <fct>   <chr>            <dbl>
##  1 3       01001         9569283.
##  2 2       01001        67090707.
##  3 1       01001        88240896.
##  4 5       01001       353564016.
##  5 9       01001       983953954.
##  6 1       01003        14296132.
##  7 3       01003        59333163.
##  8 2       01003       191161474.
##  9 5       01003       563092256.
## 10 9       01003      1367670103.
## # ... with 15,700 more rows
# Assess overlap of cluster for counties already clustered
tmpDistDF_new <- dfClust100k_k8 %>%
    select(countyFIPS, hierCluster=cluster) %>%
    left_join(countyClusterDist_new, by="countyFIPS") %>%
    arrange(countyFIPS, dist)
# Keep each county's single closest cluster, then cross-tabulate against the
# originally assigned cluster and plot the overlap as a shaded tile chart
tmpDistDF_new %>%
    group_by(countyFIPS) %>%
    filter(row_number()==1) %>%
    ungroup() %>%
    mutate(cluster=factor(as.integer(as.character(cluster)))) %>%
    count(hierCluster, cluster) %>%
    group_by(hierCluster) %>%
    mutate(pct=n/sum(n)) %>%
    ungroup() %>%
    ggplot(aes(x=cluster, y=hierCluster)) + 
    geom_tile(aes(fill=pct)) +
    geom_text(aes(label=paste0("n=", n, "\n(", round(100*pct), "%)"))) + 
    scale_fill_continuous(low="white", high="lightgreen") +
    labs(title="Overlap of assigned hierarchical cluster and closest hierarchical cluster mean", 
         x="Closest hierarchical cluster mean (k=5)", 
         y="Assigned hierarchical cluster (k=5)"
         )

While there are some minor differences, overwhelmingly counties are already aligned to the closest cluster mean. The approach is then extended to counties with populations under 100k:

# Full clustering database
# (assign every county -- clustered or not -- to its closest cluster mean)
distOnlyClust_new <- countyClusterDist_new %>%
    arrange(countyFIPS, dist) %>%
    group_by(countyFIPS) %>%
    filter(row_number()==1) %>%
    ungroup() %>%
    left_join(select(dfClust100k_k8, countyFIPS, hierCluster=cluster), by="countyFIPS")

# Cluster assignments depending on size of county
# (hierCluster is NA for counties that were not in the original clustering)
distOnlyClust_new %>%
    ggplot(aes(x=cluster)) + 
    geom_bar(fill="lightblue") + 
    geom_text(stat="count", aes(y=..count../2, label=..count..)) +
    facet_wrap(~ifelse(is.na(hierCluster), "Not in hierarchical data", "In hierarchical data"), 
               scales="free_y"
               ) + 
    labs(title="Closest hierarchical cluster")

# Plot all counties based on closest cluster
# (factor levels order the legend from lower to higher burden)
sparseCountyClusterMap(rename(distOnlyClust_new, fips=countyFIPS) %>% 
                           mutate(cluster=factor(cluster, levels=c(1, 3, 2, 9, 5))), 
                       brewPalette="viridis", 
                       caption="Closest hierarchical cluster mean (euclidean distance of tdpm7)"
                       ) + 
    labs(title="Lighter clusters generally have higher/earlier burden")

There continue to be meaningful geographic trends, with counties frequently adjacent to other counties in the same segment. Smaller counties are over-represented in groups with relatively high burden (9 and 2).

Next steps are to try to bring the shape dimension slightly more to the surface (in particular for segments 2 and 3 which have a fat 90% range early in the pandemic).

Data for segments 2 and 3 are extracted, then clustering is run solely for these groups:

# Create data containing only clusters 2 and 3
# (join pulls the full time series for every county in those two segments)
p2Data_Seg_23 <- distOnlyClust_new %>%
    filter(hierCluster %in% c(2, 3)) %>%
    select(countyFIPS, hierCluster) %>%
    left_join(p2DataAll, by="countyFIPS")
p2Data_Seg_23
## # A tibble: 234,112 x 8
##    countyFIPS hierCluster    pop state date       name  value   pct
##    <chr>      <fct>        <dbl> <chr> <date>     <chr> <dbl> <dbl>
##  1 01015      2           113605 AL    2020-01-25 tdpm7     0     0
##  2 01015      2           113605 AL    2020-01-25 tcpm7     0     0
##  3 01015      2           113605 AL    2020-01-26 tdpm7     0     0
##  4 01015      2           113605 AL    2020-01-26 tcpm7     0     0
##  5 01015      2           113605 AL    2020-01-27 tdpm7     0     0
##  6 01015      2           113605 AL    2020-01-27 tcpm7     0     0
##  7 01015      2           113605 AL    2020-01-28 tdpm7     0     0
##  8 01015      2           113605 AL    2020-01-28 tcpm7     0     0
##  9 01015      2           113605 AL    2020-01-29 tdpm7     0     0
## 10 01015      2           113605 AL    2020-01-29 tcpm7     0     0
## # ... with 234,102 more rows
# Create distance matrix and hierarchical clusters
# (wide matrix: one row per county, one column per date of tdpm7 values)
p2Dist_Seg_23 <- p2Data_Seg_23 %>% 
    filter(name=="tdpm7") %>% 
    select(countyFIPS, date, value) %>% 
    pivotData(pivotKeys="countyFIPS", toLonger = FALSE, nameVar="date", valVar="value") %>%
    column_to_rownames("countyFIPS") %>%
    as.matrix() %>%
    dist()

# Run clustering with method "complete"
p2Complete_Seg_23 <- hclust(p2Dist_Seg_23, method="complete")
plot(p2Complete_Seg_23)

# Plot cluster summaries
# (try cuts of k=2 through k=5 to compare candidate segmentations)
plotList_Seg_23 <- lapply(2:5, 
                          FUN=function(x) plotHierarchical(p2Complete_Seg_23, 
                                                           k=x, 
                                                           df=filter(p2Data_Seg_23, pop>=100000), 
                                                           keyVar="value", 
                                                           facetScale="free_y"
                                                           )
                          )
## .
##   1   2 
##  81 155
## Joining, by = "countyFIPS"

## .
##   1   2   3 
##  81 144  11
## Joining, by = "countyFIPS"

## .
##   1   2   3   4 
##  48 144  33  11
## Joining, by = "countyFIPS"

## .
##   1   2   3   4   5 
##  46   2 144  33  11
## Joining, by = "countyFIPS"

# Create the four cluster summary
# (cutree returns a named integer vector; convert to a tibble with a factor cluster)
dfClust_Seg_23_k4 <- cutree(p2Complete_Seg_23, k=4) %>%
    vecToTibble(colNameData="tempCluster", colNameName="countyFIPS") %>%
    mutate(cluster=factor(tempCluster))

# Comparison to original hierarchical cluster
dfClust_Seg_23_k4 %>%
    left_join(select(distOnlyClust_new, countyFIPS, hierCluster), by="countyFIPS") %>%
    count(hierCluster, cluster)
## # A tibble: 4 x 3
##   hierCluster cluster     n
##   <fct>       <fct>   <int>
## 1 2           1          48
## 2 2           3          33
## 3 3           2         144
## 4 3           4          11
# Map of the new clusters
dfClust_Seg_23_k4 %>%
    select(fips=countyFIPS, cluster) %>%
    usmap::plot_usmap(regions="counties", data=., values="cluster") + 
    scale_fill_brewer("Cluster", palette="Set1")

# Plot the four cluster summary
# (captures the cluster-level summary frame for use in later distance steps)
clData_Seg_23_k4 <- plotHierarchical(obj=select(dfClust_Seg_23_k4, countyFIPS, cluster), 
                                     df=p2Data_Seg_23, 
                                     keyVar="value", 
                                     facetScale="free_y"
                                     )
## Joining, by = "countyFIPS"

As expected, each of the original segments is split, helping to bring out a mix of shape and total burden:

While small, there is meaningful value in the ~2000 deaths per million cluster with early disease. It is focused on major metros (Chicago, Detroit, Philadelphia, New Orleans, NYC collar) that had early spikes similar to core NYC counties but without the associated high levels of total burden. These are distinct from the much larger group of counties that got to ~2000 deaths per million with a much later timing.

Next steps are to create a full clustering database and extend to counties with under 100k population.

First, the segments for counties with 100k+ are combined:

# Combine segments: counties re-clustered in the 2/3 split get a two-digit code
# (10*original + new sub-cluster); all other counties keep the original cluster
dfClust100k_full <- dfClust100k_k8 %>%
    select(countyFIPS, origCluster=cluster) %>%
    full_join(select(dfClust_Seg_23_k4, countyFIPS, newCluster=cluster), by="countyFIPS") %>%
    mutate(finalCluster=ifelse(is.na(newCluster), 
                               as.integer(as.character(origCluster)), 
                               10*as.integer(as.character(origCluster)) + as.integer(as.character(newCluster))
                               ), 
           finalCluster=factor(finalCluster, levels=sort(unique(finalCluster)))
           )
dfClust100k_full
## # A tibble: 599 x 4
##    countyFIPS origCluster newCluster finalCluster
##    <chr>      <fct>       <fct>      <fct>       
##  1 01003      1           <NA>       1           
##  2 01015      2           1          21          
##  3 01055      2           1          21          
##  4 01069      2           1          21          
##  5 01073      2           1          21          
##  6 01081      1           <NA>       1           
##  7 01089      1           <NA>       1           
##  8 01097      3           2          32          
##  9 01101      2           3          23          
## 10 01103      2           1          21          
## # ... with 589 more rows
# Tabulate the original-to-final cluster mapping as a sanity check
dfClust100k_full %>%
    count(origCluster, newCluster, finalCluster)
## # A tibble: 7 x 4
##   origCluster newCluster finalCluster     n
##   <fct>       <fct>      <fct>        <int>
## 1 1           <NA>       1              319
## 2 2           1          21              48
## 3 2           3          23              33
## 4 3           2          32             144
## 5 3           4          34              11
## 6 9           <NA>       9               21
## 7 5           <NA>       5               23
# Plot the seven cluster summary
clData_Seg_100k_full <- plotHierarchical(obj=select(dfClust100k_full, countyFIPS, cluster=finalCluster), 
                                     df=p2DataAll, 
                                     keyVar="value", 
                                     facetScale="free_y"
                                     )
## Joining, by = "countyFIPS"

# Map of the new clusters
dfClust100k_full %>%
    select(fips=countyFIPS, cluster=finalCluster) %>%
    usmap::plot_usmap(regions="counties", data=., values="cluster") + 
    scale_fill_brewer("Cluster", palette="Set1")

Smaller counties are then mapped to the nearest segment average:

# Calculate distances to cluster mean, then print
# NOTE(review): dfAll=p2All here, whereas the earlier call used p2DataAll --
# confirm the intended source frame is correct
countyClusterDist_100k_full <- tempDistCluster(dfClust=clData_Seg_100k_full, dfAll=p2All, typeUse="value")
countyClusterDist_100k_full
## # A tibble: 21,994 x 3
##    cluster countyFIPS       dist
##    <fct>   <chr>           <dbl>
##  1 32      01001        6770143.
##  2 21      01001       46397990.
##  3 1       01001       88240896.
##  4 34      01001      123376092.
##  5 23      01001      125257271.
##  6 5       01001      353564016.
##  7 9       01001      983953954.
##  8 1       01003       14296132.
##  9 32      01003       51092603.
## 10 21      01003      143224481.
## # ... with 21,984 more rows
# Assess overlap of cluster for counties already clustered
tmpDistDF_100k_full <- dfClust100k_full %>%
    select(countyFIPS, finalCluster) %>%
    left_join(countyClusterDist_100k_full, by="countyFIPS") %>%
    arrange(countyFIPS, dist)
# Keep each county's single closest cluster, then cross-tabulate against the
# assigned final cluster and plot the overlap
# NOTE(review): axis labels still say (k=5) -- confirm against the current cluster count
tmpDistDF_100k_full %>%
    group_by(countyFIPS) %>%
    filter(row_number()==1) %>%
    ungroup() %>%
    mutate(cluster=factor(as.integer(as.character(cluster)))) %>%
    count(hierCluster=finalCluster, cluster) %>%
    group_by(hierCluster) %>%
    mutate(pct=n/sum(n)) %>%
    ungroup() %>%
    ggplot(aes(x=cluster, y=hierCluster)) + 
    geom_tile(aes(fill=pct)) +
    geom_text(aes(label=paste0("n=", n, "\n(", round(100*pct), "%)"))) + 
    scale_fill_continuous(low="white", high="lightgreen") +
    labs(title="Overlap of assigned hierarchical cluster and closest hierarchical cluster mean", 
         x="Closest hierarchical cluster mean (k=5)", 
         y="Assigned hierarchical cluster (k=5)"
         )

# Full clustering database
# (every county assigned to its closest of the seven combined cluster means)
distOnlyClust_100k_full <- countyClusterDist_100k_full %>%
    arrange(countyFIPS, dist) %>%
    group_by(countyFIPS) %>%
    filter(row_number()==1) %>%
    ungroup() %>%
    left_join(select(dfClust100k_full, countyFIPS, hierCluster=finalCluster), by="countyFIPS")

# Cluster assignments depending on size of county
distOnlyClust_100k_full %>%
    ggplot(aes(x=cluster)) + 
    geom_bar(fill="lightblue") + 
    geom_text(stat="count", aes(y=..count../2, label=..count..)) +
    facet_wrap(~ifelse(is.na(hierCluster), "Not in hierarchical data", "In hierarchical data"), 
               scales="free_y"
               ) + 
    labs(title="Closest hierarchical cluster")

# Plot all counties based on closest cluster
# (factor levels order the legend from lower to higher burden)
sparseCountyClusterMap(rename(distOnlyClust_100k_full, fips=countyFIPS) %>% 
                           mutate(cluster=factor(cluster, levels=c(1, 32, 34, 21, 23, 5, 9))), 
                       brewPalette="viridis", 
                       caption="Closest hierarchical cluster mean (euclidean distance of % cumulative death)"
                       ) + 
    labs(title="Lighter clusters have higher burden and/or higher percentage of burden early")

There continue to be meaningful patterns by geography. Next steps are to download the latest data and run the full segment diagnostics.

New data are downloaded, with the above segments applied:

# Input files for this pull, plus the prior pull used for comparison checks
readList <- list("usafCase"="./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20210622.csv", 
                 "usafDeath"="./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20210622.csv"
                 )
compareList <- list("usafCase"=cty_newdata_20210608$dfRaw$usafCase, 
                    "usafDeath"=cty_newdata_20210608$dfRaw$usafDeath
                    )
# Counties under 25k population are lumped into pseudo-cluster "999"
dfClust <- distOnlyClust_100k_full %>%
    select(countyFIPS, cluster) %>%
    mutate(cluster=as.character(cluster)) %>%
    left_join(getCountyData(), by="countyFIPS") %>%
    mutate(origCluster=cluster, cluster=ifelse(pop>=25000, cluster, "999"))
useClusters <- as.character(dfClust$cluster) %>% 
    purrr::set_names(dfClust$countyFIPS)

# Create new clusters
# (downloadTo is NA when a file already exists on disk, so nothing is re-downloaded)
cty_newdata_20210622 <- readRunUSAFacts(maxDate="2021-06-20", 
                                        downloadTo=lapply(readList, 
                                                          FUN=function(x) if(file.exists(x)) NA else x
                                                          ),
                                        readFrom=readList, 
                                        compareFile=compareList, 
                                        writeLog="./RInputFiles/Coronavirus/USAF_NewData_20210622_v005.log", 
                                        ovrwriteLog=TRUE,
                                        useClusters=useClusters,
                                        skipAssessmentPlots=FALSE,
                                        brewPalette="Paired"
                                        )
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   `County Name` = col_character(),
##   State = col_character(),
##   StateFIPS = col_character()
## )
## i Use `spec()` for the full column specifications.
## 
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS 
## 
## 
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date

## 
## 
## Checking for similarity of: column names
## In reference but not in current: 
## In current but not in reference: 
## 
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 14
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_NewData_20210622_v005.log
## 
## Checking for similarity of: county
## In reference but not in current: 
## In current but not in reference:

## 
## 
## ***Differences of at least 5 and at least 5%
## 
## 0 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210622_v005.log
## 
## 
## ***Differences of at least 0 and at least 0.1%
## 
## 0 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210622_v005.log
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   `County Name` = col_character(),
##   State = col_character(),
##   StateFIPS = col_character()
## )
## i Use `spec()` for the full column specifications.

## 
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS 
## 
## 
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date

## 
## 
## Checking for similarity of: column names
## In reference but not in current: 
## In current but not in reference: 
## 
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 14
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_NewData_20210622_v005.log
## 
## Checking for similarity of: county
## In reference but not in current: 
## In current but not in reference:

## 
## 
## ***Differences of at least 5 and at least 5%
## 
## 0 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210622_v005.log

## 
## 
## ***Differences of at least 0 and at least 0.1%
## 
## 0 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210622_v005.log
## 
## 
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
##   isType   cases new_cases            n
##   <chr>    <dbl>     <dbl>        <dbl>
## 1 before 6.66e+9   3.29e+7 1647588     
## 2 after  6.63e+9   3.28e+7 1621272     
## 3 pctchg 4.52e-3   3.78e-3       0.0160
## 
## 
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
##   isType  deaths   new_deaths            n
##   <chr>    <dbl>        <dbl>        <dbl>
## 1 before 1.34e+8 595239       1647588     
## 2 after  1.34e+8 593224       1621272     
## 3 pctchg 3.70e-3      0.00339       0.0160

## NULL

# Plot all counties based on closest cluster
sparseCountyClusterMap(cty_newdata_20210622$useClusters, 
                       caption="Includes only counties with 25k+ population",
                       brewPalette="viridis"
                       )

# Persist this run's processed object for later comparisons
# NOTE(review): presumably ovrWriteError=FALSE permits overwriting an existing
# RDS -- confirm against saveToRDS() in the generic utilities
saveToRDS(cty_newdata_20210622, ovrWriteError=FALSE)

The updated county clustering routines appear to work as intended. Next steps are to update the CDC excess deaths process.

The latest data are downloaded, with existing clusters applied:

# Input files for this pull; the June pull (read once from RDS, rather than
# re-reading the file for each use) supplies both the comparison data and the
# cluster assignments
readList <- list("usafCase"="./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20210813.csv", 
                 "usafDeath"="./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20210813.csv"
                 )
prior_20210622 <- readFromRDS("cty_newdata_20210622")
compareList <- list("usafCase"=prior_20210622$dfRaw$usafCase, 
                    "usafDeath"=prior_20210622$dfRaw$usafDeath
                    )

# Create new clusters
# (downloadTo is NA when a file already exists on disk, so nothing is re-downloaded)
cty_newdata_20210813 <- readRunUSAFacts(maxDate="2021-08-11", 
                                        downloadTo=lapply(readList, 
                                                          FUN=function(x) if(file.exists(x)) NA else x
                                                          ),
                                        readFrom=readList, 
                                        compareFile=compareList, 
                                        writeLog="./RInputFiles/Coronavirus/USAF_NewData_20210813_v005.log", 
                                        ovrwriteLog=TRUE,
                                        useClusters=prior_20210622$useClusters,
                                        skipAssessmentPlots=FALSE,
                                        brewPalette="Paired"
                                        )
## 
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20210813.csv
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   `County Name` = col_character(),
##   State = col_character(),
##   StateFIPS = col_character()
## )
## i Use `spec()` for the full column specifications.
## 
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS 
## 
## 
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date

## 
## 
## Checking for similarity of: column names
## In reference but not in current: 
## In current but not in reference: 
## 
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 51
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_NewData_20210813_v005.log
## 
## Checking for similarity of: county
## In reference but not in current: 
## In current but not in reference:

## 
## 
## ***Differences of at least 5 and at least 5%
## 
## 0 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210813_v005.log
## 
## 
## ***Differences of at least 0 and at least 0.1%
## 
## 0 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210813_v005.log
## 
## 
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20210813.csv
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   `County Name` = col_character(),
##   State = col_character(),
##   StateFIPS = col_character()
## )
## i Use `spec()` for the full column specifications.

## 
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS 
## 
## 
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date

## 
## 
## Checking for similarity of: column names
## In reference but not in current: 
## In current but not in reference: 
## 
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 51
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_NewData_20210813_v005.log
## 
## Checking for similarity of: county
## In reference but not in current: 
## In current but not in reference:

## 
## 
## ***Differences of at least 5 and at least 5%
## 
## 0 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210813_v005.log

## 
## 
## ***Differences of at least 0 and at least 0.1%
## 
## 0 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210813_v005.log
## 
## 
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
##   isType   cases new_cases            n
##   <chr>    <dbl>     <dbl>        <dbl>
## 1 before 8.38e+9   3.54e+7 1810431     
## 2 after  8.34e+9   3.52e+7 1781514     
## 3 pctchg 4.49e-3   5.53e-3       0.0160
## 
## 
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
##   isType  deaths   new_deaths            n
##   <chr>    <dbl>        <dbl>        <dbl>
## 1 before 1.65e+8 611955       1810431     
## 2 after  1.64e+8 607482       1781514     
## 3 pctchg 3.93e-3      0.00731       0.0160

## NULL

# Plot all counties based on closest cluster
sparseCountyClusterMap(cty_newdata_20210813$useClusters, 
                       caption="Includes only counties with 25k+ population",
                       brewPalette="viridis"
                       )

# Dates with large amounts of negative restatement
# (counties with negative new_cases indicate data revisions on that date)
cty_newdata_20210813$dfPerCapita %>% 
    group_by(date) %>% 
    summarize(nNeg=sum(new_cases < 0), nPos=sum(new_cases > 0), new_cases=sum(new_cases)) %>% 
    arrange(-nNeg)
## # A tibble: 567 x 4
##    date        nNeg  nPos new_cases
##    <date>     <int> <int>     <dbl>
##  1 2021-07-09   310  1797   -399248
##  2 2020-12-28   266  2372    216179
##  3 2020-12-25   262  1350     49223
##  4 2020-10-24   256  2230     53241
##  5 2021-06-11   205  1706     -9167
##  6 2021-06-23   191  1629     15003
##  7 2021-06-18   184  1442     20155
##  8 2021-06-25   183  1545     24105
##  9 2021-04-16   174  2237     65229
## 10 2021-06-30   174  1610     10814
## # ... with 557 more rows
# Restatements taking place between July 4-14, 2021
# (zoom in around the largest negative-restatement date found above)
cty_newdata_20210813$dfPerCapita %>% 
    group_by(date) %>% 
    summarize(nNeg=sum(new_cases < 0), nPos=sum(new_cases > 0), new_cases=sum(new_cases)) %>% 
    filter(date >= "2021-07-04", date <= "2021-07-14")
## # A tibble: 11 x 4
##    date        nNeg  nPos new_cases
##    <date>     <int> <int>     <dbl>
##  1 2021-07-04     8   333      2737
##  2 2021-07-05    12   395      4228
##  3 2021-07-06    57  1617     27035
##  4 2021-07-07    97  1756     21302
##  5 2021-07-08   129  1613     19276
##  6 2021-07-09   310  1797   -399248
##  7 2021-07-10    17   653    452834
##  8 2021-07-11     8   362      3435
##  9 2021-07-12    59  1596     37864
## 10 2021-07-13    89  1588     21977
## 11 2021-07-14    97  1984     32297
# Restatements taking place between December 21, 2020 and January 6, 2021
# (zoom in around the smaller holiday-period anomaly)
cty_newdata_20210813$dfPerCapita %>% 
    group_by(date) %>% 
    summarize(nNeg=sum(new_cases < 0), nPos=sum(new_cases > 0), new_cases=sum(new_cases)) %>% 
    filter(date >= "2020-12-21", date <= "2021-01-06")
## # A tibble: 17 x 4
##    date        nNeg  nPos new_cases
##    <date>     <int> <int>     <dbl>
##  1 2020-12-21     8  2659    362667
##  2 2020-12-22     2  2514    174503
##  3 2020-12-23     0  2567    227200
##  4 2020-12-24     3  2345    185122
##  5 2020-12-25   262  1350     49223
##  6 2020-12-26    33  1964    170262
##  7 2020-12-27     5  2018    121643
##  8 2020-12-28   266  2372    216179
##  9 2020-12-29     5  2510    229421
## 10 2020-12-30    20  2698    238917
## 11 2020-12-31    32  2406    222938
## 12 2021-01-01     4  1603    190655
## 13 2021-01-02     0  2346    283820
## 14 2021-01-03     8  2470    229336
## 15 2021-01-04     4  2484    169859
## 16 2021-01-05     0  2773    238758
## 17 2021-01-06     0  2885    256350
# Persist this run's processed object for later comparisons
saveToRDS(cty_newdata_20210813, ovrWriteError=FALSE)

There is a very large restatement of new_cases data occurring on or around July 8-11, 2021. Perhaps smoothing can be considered where each value for July 8-11, 2021 data is treated as the average across those four days. There is a similar though smaller anomaly in the December 24-29, 2020 range.

Dates are examined for those with the largest number of negative revisions in new_cases and/or new_deaths:

# All restatements
# (note the deliberately different thresholds: nNeg counts values <= -1,
# while sumNeg accumulates anything strictly below 0)
plotData <- cty_newdata_20210813$dfPerCapita %>% 
    select(date, new_cases, new_deaths, cpm7, dpm7) %>%
    pivot_longer(-date) %>%
    filter(!is.na(value)) %>%
    group_by(date, name) %>% 
    summarize(nNeg=sum(value <= -1), sumNeg=sum(ifelse(value < 0, value, 0)), .groups="drop")

# Human-readable facet labels for each metric
tempMapper <- c("cpm7"="Cases per million (7-day mean)", 
                "dpm7"="Deaths per million (7-day mean)", 
                "new_cases"="New cases", 
                "new_deaths"="New deaths"
                )

plotData %>%
    ggplot(aes(x=date, y=nNeg)) + 
    geom_line() + 
    facet_wrap(~tempMapper[name]) + 
    labs(x=NULL, 
         y="Number of records", 
         title="Counties reporting less than or equal to -1 for metric on day"
         )

plotData %>%
    ggplot(aes(x=date, y=sumNeg)) + 
    geom_line() + 
    facet_wrap(~tempMapper[name], scales="free_y") + 
    labs(x=NULL, 
         y="Sum of value for counties recording negative", 
         title="Counties reporting less than 0 for metric on day"
         )

For records with negative values, the rolling-7 is calculated around the day of the negative:

# All restatements
# (flag negatives whose 7-day rolling mean around the date is also negative or
# undefined; the global arrange(date) still yields date order within each
# countyFIPS/name group before rollmean is applied)
plotData <- cty_newdata_20210813$dfPerCapita %>% 
    select(date, countyFIPS, new_cases, new_deaths, cpm7, dpm7) %>%
    pivot_longer(-c(date, countyFIPS)) %>%
    filter(!is.na(value)) %>%
    group_by(countyFIPS, name) %>% 
    arrange(date) %>%
    mutate(value7=zoo::rollmean(value, k=7, fill=NA), 
           isNeg=(value <= -1), 
           isNeg7=(is.na(value7) | value7 < 0)
           ) %>%
    group_by(date, name) %>%
    summarize(nNeg=sum(isNeg & isNeg7), sumNeg=sum(ifelse(isNeg & isNeg7, value, 0)), .groups="drop")

# Human-readable facet labels for each metric
tempMapper <- c("cpm7"="Cases per million (7-day mean)", 
                "dpm7"="Deaths per million (7-day mean)", 
                "new_cases"="New cases", 
                "new_deaths"="New deaths"
                )

plotData %>%
    ggplot(aes(x=date, y=nNeg)) + 
    geom_line() + 
    facet_wrap(~tempMapper[name]) + 
    labs(x=NULL, 
         y="Number of records", 
         title="Counties reporting negative value that are also negative over the 7-day window"
         )

plotData %>%
    ggplot(aes(x=date, y=sumNeg)) + 
    geom_line() + 
    facet_wrap(~tempMapper[name], scales="free_y") + 
    labs(x=NULL, 
         y="Sum of value", 
         title="Counties reporting less than 0 for metric on day that are also negative over the 7-day window"
         )

# Overall counts and sums by negative-flag combination, to size how much data
# a 7-day-window adjustment would touch
cty_newdata_20210813$dfPerCapita %>% 
    select(date, countyFIPS, new_cases, new_deaths, cpm7, dpm7) %>%
    pivot_longer(-c(date, countyFIPS)) %>%
    filter(!is.na(value)) %>%
    group_by(countyFIPS, name) %>% 
    arrange(date) %>%
    mutate(value7=zoo::rollmean(value, k=7, fill=NA), 
           isNeg=(value <= -1), 
           isNeg7=(is.na(value7) | value7 < 0)
    ) %>% 
    ungroup() %>% 
    group_by(name, isNeg, isNeg7) %>% 
    summarize(n=n(), sumValue=sum(value))
## `summarise()` has grouped output by 'name', 'isNeg'. You can override using the `.groups` argument.
## # A tibble: 16 x 5
## # Groups:   name, isNeg [8]
##    name       isNeg isNeg7       n   sumValue
##    <chr>      <lgl> <lgl>    <int>      <dbl>
##  1 cpm7       FALSE FALSE  1701869 336714571.
##  2 cpm7       FALSE TRUE     45946   3248568.
##  3 cpm7       TRUE  FALSE     3859   -265077.
##  4 cpm7       TRUE  TRUE     10988  -2790977.
##  5 dpm7       FALSE FALSE  1627092   6667360.
##  6 dpm7       FALSE TRUE    126669     20031.
##  7 dpm7       TRUE  FALSE      678     -4144.
##  8 dpm7       TRUE  TRUE      8223   -102243.
##  9 new_cases  FALSE FALSE  1732888  35567937 
## 10 new_cases  FALSE TRUE     30335    362101 
## 11 new_cases  TRUE  FALSE    14909   -600467 
## 12 new_cases  TRUE  TRUE      3382   -114828 
## 13 new_deaths FALSE FALSE  1749115    616693 
## 14 new_deaths FALSE TRUE     27664      2807 
## 15 new_deaths TRUE  FALSE     3087     -7199 
## 16 new_deaths TRUE  TRUE      1648     -4819

Making an adjustment for situations where a metric is negative but the 7-day total is non-negative can help to smooth some restatement anomalies. There will still be restatements in the data, and these seem to increase in the more recent months of the data.

Next steps are to build a function to manage the smoothing, with a goal to incorporate this as part of per-capita processing.

A function is built to take an ordered vector and to replace each negative value, along with the surrounding window of n values, with the mean of that window:

smoothNegativeValues <- function(x, 
                                 negTol=-0.001, 
                                 windowSize=3, 
                                 windowAlign="center", 
                                 ...
                                 ) {
    
    # FUNCTION ARGUMENTS:
    # x: a vector that has been sorted such that successive observations are sequential
    # negTol: tolerance for considering a value in x to be negative
    # windowSize: the size for the window to use
    # windowAlign: the alignment for the window (passed to zoo::rollmean); only
    #              "center" is supported -- see the guard below
    # ...: any other arguments to pass to zoo::rollmean
    #
    # RETURNS: x with each qualifying negative value, and the surrounding
    #          window of values, replaced by that window's rolling mean
    
    # The replacement indexing below hard-assumes a centered window; any other
    # alignment would silently overwrite the wrong elements, so fail fast
    if (windowAlign != "center") {
        stop("smoothNegativeValues() currently supports only windowAlign='center'", call.=FALSE)
    }
    
    # Get the rolling means for x
    muRoll <- zoo::rollmean(x, k=windowSize, fill=NA, align=windowAlign, ...)
    
    # Find the negative values, then keep only those whose rolling mean exists
    # and is itself non-negative (within tolerance), so replacement is safe
    negVals <- which(x < negTol)
    okFix <- which(!is.na(muRoll) & muRoll >= negTol)
    negFix <- intersect(negVals, okFix)
    
    # Replace the negative values with the rolling mean
    # CAUTION: this will double over-write if negative values are very close to each other
    eachSide <- ceiling((windowSize-1)/2)
    for (idxFix in negFix) x[(idxFix-eachSide):(idxFix+eachSide)] <- muRoll[idxFix]
    
    # Return the updated vector
    x
    
}

# Simple example, works well: the lone -2 and its two neighbors become the window mean
smoothNegativeValues(c(1:5, 10, -2, 10, 5:1))
##  [1] 1 2 3 4 5 6 6 6 5 4 3 2 1
# More complex example: adjacent negatives double over-write each other's windows,
# so the smoothed vector no longer sums to the original total
smoothNegativeValues(c(1:5, 14, -2, -2, 14, 5:1)) %>% round(1)
##  [1] 1.0 2.0 3.0 4.0 5.0 3.3 3.3 3.3 3.3 5.0 4.0 3.0 2.0 1.0
# All county-date rows where reported new cases are negative, most negative first
# (the filter condition is row-wise, so no grouping is needed)
allNegCases <- cty_newdata_20210813$dfPerCapita %>% 
    select(countyFIPS, state, date, new_cases) %>% 
    filter(new_cases <= -0.5) %>% 
    arrange(new_cases)

allNegCases
## # A tibble: 18,291 x 4
##    countyFIPS state date       new_cases
##    <chr>      <chr> <date>         <dbl>
##  1 48113      TX    2021-07-09    -43526
##  2 48439      TX    2021-07-09    -43415
##  3 48029      TX    2021-07-09    -41518
##  4 48215      TX    2021-07-09    -30342
##  5 55053      WI    2020-12-27    -21000
##  6 48121      TX    2021-07-09    -20662
##  7 48029      TX    2020-12-25    -18856
##  8 48085      TX    2021-07-09    -17026
##  9 06037      CA    2020-12-25    -12459
## 10 48355      TX    2021-07-09    -12254
## # ... with 18,281 more rows
# Count the negative rows and total the negative cases
summarize(allNegCases, n=n(), new_cases=sum(new_cases))
## # A tibble: 1 x 2
##       n new_cases
##   <int>     <dbl>
## 1 18291   -715295
# Diagnostic helper: print, smooth, and plot new-case data for one county
countyCheck <- function(df, keyDate, keyFIPS, windowSize=7) {

    # FUNCTION ARGUMENTS:
    # df: per-capita data frame with countyFIPS, state, date, cases, new_cases
    # keyDate: date of interest (Date or coercible string)
    # keyFIPS: county FIPS code to inspect
    # windowSize: window passed through to smoothNegativeValues()

    # Coerce a character keyDate (e.g. "2021-07-09") to Date
    if (!inherits(keyDate, "Date")) keyDate <- as.Date(keyDate)
    
    # Restrict to the county of interest
    oneCounty <- filter(df, countyFIPS==keyFIPS)
    
    # Show every row with (rounded) negative new cases
    oneCounty %>%
        filter(round(new_cases) < 0) %>%
        print()
    
    # Apply the negative-value smoothing
    oneCounty <- oneCounty %>%
        mutate(smoothCases=smoothNegativeValues(new_cases, windowSize=windowSize))
    
    # Show the dates within half a window of keyDate, raw vs smoothed
    halfWin <- ceiling((windowSize-1)/2)
    oneCounty %>%
        filter(date >= keyDate - halfWin, date <= keyDate + halfWin) %>%
        select(countyFIPS, state, date, cases, new_cases, smoothCases) %>%
        print()
    
    # Plot the raw and smoothed series side by side
    facetLabels <- c("new_cases"="Raw cases", "smoothCases"="Smoothed cases")
    p1 <- oneCounty %>%
        select(countyFIPS, state, date, new_cases, smoothCases) %>%
        pivot_longer(-c(countyFIPS, state, date)) %>%
        ggplot(aes(x=date, y=value)) + 
        geom_line(aes(group=name, color=name)) + 
        facet_wrap(~facetLabels[name]) + 
        labs(x=NULL, y="Cases", title=paste0("Cases data for county ", keyFIPS)) + 
        theme(legend.position="none")
    print(p1)
    
    # Compare totals: raw new_cases vs smoothed cases
    oneCounty %>%
        select(countyFIPS, state, date, new_cases, smoothCases) %>%
        summarize(across(where(is.numeric), sum, na.rm=TRUE)) %>%
        print()
    
}

# Data for countyFIPS 48113 (Texas): a large single-day negative restatement on 2021-07-09
countyCheck(cty_newdata_20210813$dfPerCapita, keyDate="2021-07-09", keyFIPS="48113")
## # A tibble: 1 x 15
##   countyFIPS state date        cases new_cases deaths new_deaths    tcpm     cpm
##   <chr>      <chr> <date>      <dbl>     <dbl>  <dbl>      <dbl>   <dbl>   <dbl>
## 1 48113      TX    2021-07-09 263826    -43526   4139          4 100104. -16515.
## # ... with 6 more variables: tdpm <dbl>, dpm <dbl>, tcpm7 <dbl>, cpm7 <dbl>,
## #   tdpm7 <dbl>, dpm7 <dbl>
## # A tibble: 7 x 6
##   countyFIPS state date        cases new_cases smoothCases
##   <chr>      <chr> <date>      <dbl>     <dbl>       <dbl>
## 1 48113      TX    2021-07-06 306870         0        147.
## 2 48113      TX    2021-07-07 307033       163        147.
## 3 48113      TX    2021-07-08 307352       319        147.
## 4 48113      TX    2021-07-09 263826    -43526        147.
## 5 48113      TX    2021-07-10 307901     44075        147.
## 6 48113      TX    2021-07-11 307901         0        147.
## 7 48113      TX    2021-07-12 307901         0        147.

## # A tibble: 1 x 2
##   new_cases smoothCases
##       <dbl>       <dbl>
## 1    323669      323669
# Negative values that sit next to another negative value within the same county
# (lag/lead must be computed within county before filtering)
repNegCases <- cty_newdata_20210813$dfPerCapita %>% 
    select(countyFIPS, state, date, new_cases) %>% 
    group_by(countyFIPS) %>% 
    mutate(negHere=(new_cases <= -0.5), 
           negNeighbor=(lag(new_cases) <= -0.5 | lead(new_cases) <= -0.5)
           ) %>% 
    ungroup() %>% 
    filter(negHere, negNeighbor) %>% 
    select(countyFIPS, state, date, new_cases) %>% 
    arrange(new_cases)

repNegCases
## # A tibble: 2,278 x 4
##    countyFIPS state date       new_cases
##    <chr>      <chr> <date>         <dbl>
##  1 48471      TX    2021-07-09      -870
##  2 48325      TX    2021-06-12      -406
##  3 21235      KY    2021-06-15      -390
##  4 48025      TX    2021-07-09      -390
##  5 48185      TX    2021-07-09      -390
##  6 21235      KY    2021-06-16      -380
##  7 48201      TX    2021-06-17      -345
##  8 06025      CA    2021-06-30      -336
##  9 48069      TX    2021-07-09      -314
## 10 21235      KY    2021-06-12      -309
## # ... with 2,268 more rows
# Count the repeating-negative rows and total their negative cases
summarize(repNegCases, n=n(), new_cases=sum(new_cases))
## # A tibble: 1 x 2
##       n new_cases
##   <int>     <dbl>
## 1  2278    -16455
# Data for countyFIPS 48471 (Texas): back-to-back negative values on/around 2021-07-09,
# illustrating the double over-write limitation of the smoother
countyCheck(cty_newdata_20210813$dfPerCapita, keyDate="2021-07-09", keyFIPS="48471")
## # A tibble: 52 x 15
##    countyFIPS state date       cases new_cases deaths new_deaths   tcpm     cpm
##    <chr>      <chr> <date>     <dbl>     <dbl>  <dbl>      <dbl>  <dbl>   <dbl>
##  1 48471      TX    2020-08-05  2838        -4     40          2 38892.   -54.8
##  2 48471      TX    2020-08-16  3189        -1     43          0 43702.   -13.7
##  3 48471      TX    2020-09-03  3417      -246     48          0 46827. -3371. 
##  4 48471      TX    2020-09-04  3415        -2     48          0 46799.   -27.4
##  5 48471      TX    2020-09-09  3403      -451     51          1 46635. -6181. 
##  6 48471      TX    2020-09-12  3370       -46     52          0 46183.  -630. 
##  7 48471      TX    2020-09-17  3567      -119     53          0 48882. -1631. 
##  8 48471      TX    2020-09-23  3690        -3     56          0 50568.   -41.1
##  9 48471      TX    2020-09-24  3689        -1     57          1 50554.   -13.7
## 10 48471      TX    2020-10-07  3788       -31     57          0 51911.  -425. 
## # ... with 42 more rows, and 6 more variables: tdpm <dbl>, dpm <dbl>,
## #   tcpm7 <dbl>, cpm7 <dbl>, tdpm7 <dbl>, dpm7 <dbl>
## # A tibble: 7 x 6
##   countyFIPS state date       cases new_cases smoothCases
##   <chr>      <chr> <date>     <dbl>     <dbl>       <dbl>
## 1 48471      TX    2021-07-06  8703         2        3.29
## 2 48471      TX    2021-07-07  8757        54        3.29
## 3 48471      TX    2021-07-08  8718       -39        3.29
## 4 48471      TX    2021-07-09  7848      -870        3.29
## 5 48471      TX    2021-07-10  8718       870        3.29
## 6 48471      TX    2021-07-11  8721         3        1.29
## 7 48471      TX    2021-07-12  8724         3        1.29

## # A tibble: 1 x 2
##   new_cases smoothCases
##       <dbl>       <dbl>
## 1      9240       9393.

Curing the single negative value issue addresses over 95% of the observed issue with new_cases.

Despite the limitations, this appears promising for further exploration. The function is applied to the full per-capita dataset:

# Apply negative-value smoothing within every county, then derive cumulative
# and rolling-7 series for both the raw and smoothed new-case counts
perCapTemp <- cty_newdata_20210813$dfPerCapita %>%
    select(countyFIPS, state, date, cases, new_cases) %>%
    group_by(countyFIPS, state) %>%
    mutate(smoothCases=smoothNegativeValues(new_cases, windowSize=7)) %>%
    mutate(cumNew=cumsum(new_cases), 
           cumSmooth=cumsum(smoothCases), 
           smooth7=zoo::rollmean(smoothCases, k=7, fill=NA), 
           cases7=zoo::rollmean(new_cases, k=7, fill=NA)
           ) %>%
    ungroup()

perCapTemp
## # A tibble: 1,781,514 x 10
##    countyFIPS state date       cases new_cases smoothCases cumNew cumSmooth
##    <chr>      <chr> <date>     <dbl>     <dbl>       <dbl>  <dbl>     <dbl>
##  1 01001      AL    2020-01-22     0         0           0      0         0
##  2 01003      AL    2020-01-22     0         0           0      0         0
##  3 01005      AL    2020-01-22     0         0           0      0         0
##  4 01007      AL    2020-01-22     0         0           0      0         0
##  5 01009      AL    2020-01-22     0         0           0      0         0
##  6 01011      AL    2020-01-22     0         0           0      0         0
##  7 01013      AL    2020-01-22     0         0           0      0         0
##  8 01015      AL    2020-01-22     0         0           0      0         0
##  9 01017      AL    2020-01-22     0         0           0      0         0
## 10 01019      AL    2020-01-22     0         0           0      0         0
## # ... with 1,781,504 more rows, and 2 more variables: smooth7 <dbl>,
## #   cases7 <dbl>
# Column totals across the full dataset (NAs removed within each sum)
summarize(perCapTemp, across(where(is.numeric), sum, na.rm=TRUE))
## # A tibble: 1 x 7
##        cases new_cases smoothCases     cumNew   cumSmooth   smooth7    cases7
##        <dbl>     <dbl>       <dbl>      <dbl>       <dbl>     <dbl>     <dbl>
## 1 8343470745  35214743    35220770 8343470745 8344928995. 34880628. 34874611.
# National daily totals (specNA keeps NA when all inputs are NA), then long format
dailyTotals <- perCapTemp %>%
    group_by(date) %>%
    summarize(across(where(is.numeric), specNA(sum)))
plotTemp <- pivot_longer(dailyTotals, cols=-date)

# Plot #1: Cumulative cases by technique
cumLabels <- c("cases"="Cases", "cumNew"="Sum new_cases", "cumSmooth"="Sum smoothed cases")
plotTemp %>%
    filter(name %in% names(cumLabels)) %>%
    ggplot(aes(x=date, y=value/1000000)) + 
    geom_line(aes(group=name, color=cumLabels[name])) + 
    labs(x=NULL, y="Cumulative Cases (millions)", title="Cumulative case evolution by approach") + 
    scale_color_discrete("Technique") + 
    theme(legend.position="bottom")

# Plot #2: new_cases vs smoothed cases
rawLabels <- c("new_cases"="Raw new_cases", "smoothCases"="Smoothed cases")
plotTemp %>%
    filter(name %in% names(rawLabels)) %>%
    ggplot(aes(x=date, y=value/1000)) + 
    geom_line(aes(group=name, color=rawLabels[name])) + 
    labs(x=NULL, y="New Cases (000s)", title="New case evolution by approach") + 
    scale_color_discrete("Technique") + 
    theme(legend.position="bottom")

# Plot #3: Rolling-7 mean for new_cases vs smoothed cases
rollLabels <- c("cases7"="new_cases", "smooth7"="Smoothed cases")
plotTemp %>%
    filter(name %in% names(rollLabels)) %>%
    ggplot(aes(x=date, y=value/1000)) + 
    geom_line(aes(group=name, color=rollLabels[name])) + 
    labs(x=NULL, y="New Cases (000s) - rolling 7-day mean", title="New case evolution by approach") + 
    scale_color_discrete("Technique") + 
    theme(legend.position="bottom")
## Warning: Removed 12 row(s) containing missing values (geom_path).

At a national level, the smoothing significantly helps with the July negative/positive restatement, as well as providing some assistance in other time periods. Plots are made for each state using the smoothed data:

# State-level daily totals of the smoothed rolling-7 series, one facet per state
stateSmooth <- perCapTemp %>%
    group_by(state, date) %>%
    summarize(smooth7=specNA(sum)(smooth7), .groups="drop")
ggplot(stateSmooth, aes(x=date, y=smooth7/1000)) + 
    geom_line() + 
    labs(x=NULL, y="Cases (000s)", title="Smoothed rolling-7 mean cases per day by state") + 
    facet_wrap(~state, scales="free_y")
## Warning: Removed 6 row(s) containing missing values (geom_path).

Several artifacts are observed in the state-level plots.

These can be further explored as a next step, with particular focus on Florida (recency) and the late 2020 pattern observed across multiple states.